Feed Hunter: deep scraper skill, pipeline, simulator, first investigation
- Built deep-scraper skill (CDP-based X feed extraction)
- Three-stage pipeline: scrape → triage → investigate
- Paper trading simulator with position tracking
- First live investigation: verified kch123 Polymarket profile ($9.3M P&L)
- Opened first paper position: Seahawks Super Bowl @ 68c
- Telegram alerts with inline action buttons
- Portal build in progress (night shift)
This commit is contained in:
281
skills/deep-scraper/scripts/triage-posts.py
Normal file
281
skills/deep-scraper/scripts/triage-posts.py
Normal file
@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Triage scraped X posts — identify posts with verifiable claims and links.
|
||||
Extracts structured "investigation tasks" for agent follow-up.
|
||||
|
||||
Usage:
|
||||
python3 triage-posts.py <posts.json> [--output triage.json]
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse
|
||||
|
||||
|
||||
# Patterns that suggest a verifiable claim.
# Each entry is a (regex, claim_type) pair; extract_claims() applies them
# case-insensitively and records every match found in a post's text.
# claim_type tags feed both priority scoring and task generation in
# triage_post().
CLAIM_PATTERNS = [
    # Performance claims ("90% win rate", "wins 80%", "10x return")
    (r'(\d+[\d.]*)\s*%\s*(win|success|profit|return|accuracy|hit rate)', 'performance_claim'),
    (r'wins?\s+(\d+[\d.]*)\s*%', 'performance_claim'),
    (r'(\d+[\d.]*)\s*%\s+of the time', 'performance_claim'),
    (r'(\d+[\d.]*)x\s+(return|profit|gain)', 'multiplier_claim'),

    # Copy/follow trading ("copy this trader", "mirror my position")
    (r'copy(ing|cat)?\s+(trader|user|bet|position|strat)', 'copy_trading'),
    (r'follow\s+(this|my|their)\s+(trade|bet|position|strat)', 'copy_trading'),
    (r'mirror(ing)?\s+(trade|bet|position)', 'copy_trading'),

    # Arbitrage/spread — classic too-good-to-be-true vocabulary
    (r'(arb|arbitrage|spread|mismatch|mispriced)', 'arbitrage_opp'),
    (r'risk[\s-]?free', 'arbitrage_opp'),
    (r'guaranteed\s+(profit|return|money)', 'arbitrage_opp'),

    # Prediction/betting markets and odds
    (r'(polymarket|kalshi|manifold|prediction\s+market)', 'prediction_market'),
    (r'(odds|probability)\s+.{0,20}\s*(\d+[\d.]*)\s*%', 'odds_claim'),
    (r'(yes|no)\s+shares?\s+at\s+(\d+)', 'shares_price'),

    # Price/target claims ("target: $120", "entry at $0.50")
    (r'(target|pt|price target)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'price_target'),
    (r'(entry|buy)\s*(at|zone|point)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'entry_point'),

    # Airdrop/free money pitches
    (r'(airdrop|free\s+money|free\s+tokens?|claiming)', 'airdrop'),
    (r'(step\s+\d|how\s+to\s+(get|claim|earn))', 'howto'),

    # References to a user/account with a track record
    (r'@\w+.*?(portfolio|track\s*record|history|performance)', 'user_reference'),
    (r'(this\s+(guy|trader|user|person|account))\s+.{0,30}(profit|win|return|made)', 'user_reference'),
]
|
||||
|
||||
# Link domains that are investigatable.
# Maps domain -> link-type tag; classify_links() matches each URL against
# these (first hit wins) so triage_post() can route links to the right
# browse_* follow-up task.
INVESTIGATABLE_DOMAINS = {
    # Prediction markets — highest-value targets for verification
    'polymarket.com': 'prediction_market',
    'kalshi.com': 'prediction_market',
    'manifold.markets': 'prediction_market',
    # Token charting tools
    'dexscreener.com': 'token_chart',
    'dextools.io': 'token_chart',
    'birdeye.so': 'token_chart',
    # Token metadata/price aggregators
    'coingecko.com': 'token_info',
    'coinmarketcap.com': 'token_info',
    'tradingview.com': 'chart',
    # Block explorers — for verifying on-chain claims
    'etherscan.io': 'blockchain',
    'solscan.io': 'blockchain',
    'basescan.org': 'blockchain',
    'github.com': 'code_repo',
    # Long-form content
    'docs.google.com': 'document',
    'notion.so': 'document',
    'medium.com': 'article',
    'substack.com': 'article',
    'youtube.com': 'video',
    'youtu.be': 'video',
}
|
||||
|
||||
|
||||
def extract_claims(text):
    """Scan *text* for verifiable claims using CLAIM_PATTERNS.

    Matching is case-insensitive. Every hit from every pattern is kept,
    so overlapping matches of different claim types all appear.

    Args:
        text: post text to scan.

    Returns:
        List of dicts, each with the claim ``type`` tag, the matched
        substring (``match``) and its ``[start, end]`` character ``span``.
    """
    found = []
    for regex, claim_type in CLAIM_PATTERNS:
        found.extend(
            {
                'type': claim_type,
                'match': hit.group(0),
                'span': [hit.start(), hit.end()],
            }
            for hit in re.finditer(regex, text, re.IGNORECASE)
        )
    return found
|
||||
|
||||
|
||||
def classify_links(links):
    """Classify links by investigatable domain.

    The URL's parsed *hostname* is matched against INVESTIGATABLE_DOMAINS
    (exact host or subdomain), not the raw URL string. The previous
    substring check (``domain in url``) misclassified URLs that merely
    contained a domain in their path or query
    (e.g. ``https://evil.io/polymarket.com``) and was spoofable via
    lookalike hosts such as ``polymarket.com.evil.io`` — a real concern
    for a scam-triage tool. Legitimate subdomains
    (``www.polymarket.com``) still match.

    Args:
        links: list of dicts with a ``url`` key and optional ``text``.

    Returns:
        Parallel list of ``{'url', 'text', 'type'}`` dicts; ``type`` is
        the INVESTIGATABLE_DOMAINS tag or ``'unknown'``.
    """
    classified = []
    for link in links:
        url = link.get('url', '')
        try:
            host = (urlparse(url).hostname or '').lower()
        except ValueError:
            # Malformed URL — treat as unclassifiable rather than crash.
            host = ''
        link_type = 'unknown'
        for domain, dtype in INVESTIGATABLE_DOMAINS.items():
            if host == domain or host.endswith('.' + domain):
                link_type = dtype
                break
        classified.append({
            'url': url,
            'text': link.get('text', ''),
            'type': link_type,
        })
    return classified
|
||||
|
||||
|
||||
# Claim type -> (action, description prefix, verification method).
# Claim types absent here (multiplier_claim, entry_point, howto,
# user_reference) contribute to priority but generate no task of their
# own, matching the original behavior.
_CLAIM_TASKS = {
    'performance_claim': (
        'verify_performance', 'Verify claim',
        'Check linked profile/data for actual track record'),
    'copy_trading': (
        'verify_trader', 'Verify trader referenced',
        'Check trader profile, recent bets/trades, actual P&L'),
    'arbitrage_opp': (
        'verify_arb', 'Verify opportunity',
        'Check if spread/mismatch still exists, calculate actual risk'),
    'odds_claim': (
        'check_market', 'Check prediction market',
        'Verify current odds, volume, resolution criteria'),
    'shares_price': (
        'check_market', 'Check prediction market',
        'Verify current odds, volume, resolution criteria'),
    'prediction_market': (
        'check_market', 'Check prediction market',
        'Verify current odds, volume, resolution criteria'),
    'price_target': (
        'check_price', 'Verify price claim',
        'Check current price vs target, chart pattern'),
    'airdrop': (
        'verify_airdrop', 'Check airdrop legitimacy',
        'Verify project, check for scam signals, confirm eligibility'),
}


def _score_priority(claims, classified_links, investigatable_links):
    """Heuristic priority score; >= 2 marks a post worth investigating."""
    priority = 0
    if claims:
        priority += 2
        # Performance claims are highest priority
        if any(c['type'] in ('performance_claim', 'multiplier_claim') for c in claims):
            priority += 2
        if any(c['type'] in ('copy_trading', 'arbitrage_opp') for c in claims):
            priority += 1
    if investigatable_links:
        priority += 2
        if any(link['type'] == 'prediction_market' for link in investigatable_links):
            priority += 2
        if any(link['type'] in ('token_chart', 'blockchain') for link in investigatable_links):
            priority += 1
    # Links we can't classify still hint at something to look at.
    if classified_links and not investigatable_links:
        priority += 1
    return priority


def _build_tasks(claims, classified_links, investigatable_links):
    """Turn extracted claims and links into concrete follow-up tasks."""
    tasks = []
    for claim in claims:
        template = _CLAIM_TASKS.get(claim['type'])
        if template is None:
            continue
        action, prefix, method = template
        tasks.append({
            'action': action,
            'description': f"{prefix}: {claim['match']}",
            'method': method,
        })
    for link in investigatable_links:
        tasks.append({
            'action': f'browse_{link["type"]}',
            'description': f'Follow and analyze: {link["url"]}',
            'url': link['url'],
            'method': f'Browse to {link["type"]} link, extract current data',
        })
    # Unknown external links (not X itself) may still be interesting.
    for link in classified_links:
        if link['type'] == 'unknown' and link['url'].startswith('http') and 'x.com' not in link['url']:
            tasks.append({
                'action': 'browse_unknown',
                'description': f'Check external link: {link["url"]}',
                'url': link['url'],
                'method': 'Browse to link, determine if relevant',
            })
    return tasks


def triage_post(post):
    """Analyze a single post for investigation potential.

    Args:
        post: scraped post dict (keys: text, links, card, author, url,
            timestamp, metrics — all optional). Never mutated.

    Returns:
        Dict echoing the post's fields plus extracted ``claims``,
        classified ``links``, ``investigatable_links``, a numeric
        ``priority``, follow-up ``tasks`` and a ``worth_investigating``
        flag (priority >= 2).
    """
    text = post.get('text', '') or ''

    # Extract claims
    claims = extract_claims(text)

    # Collect links. Copy the post's list: the original code aliased
    # post['links'] and the card-URL append below mutated the caller's
    # input data.
    all_links = list(post.get('links') or [])
    card = post.get('card') or {}
    if card.get('url'):
        all_links.append({'url': card['url'], 'text': card.get('title', '')})
    classified_links = classify_links(all_links)
    investigatable_links = [link for link in classified_links if link['type'] != 'unknown']

    priority = _score_priority(claims, classified_links, investigatable_links)
    tasks = _build_tasks(claims, classified_links, investigatable_links)

    return {
        'author': post.get('author', {}),
        'text': text,
        'url': post.get('url', ''),
        'timestamp': post.get('timestamp'),
        'metrics': post.get('metrics', {}),
        'claims': claims,
        'links': classified_links,
        'investigatable_links': investigatable_links,
        'priority': priority,
        'tasks': tasks,
        'worth_investigating': priority >= 2,
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: triage a scraped posts file and write triage.json.

    Reads the posts JSON named on the command line, triages every post,
    writes the full result (sorted by priority, with a filtered
    investigation queue) next to the input unless --output is given, and
    prints a human-readable summary.
    """
    parser = argparse.ArgumentParser(description="Triage X posts for investigation")
    parser.add_argument("input", help="Path to posts.json")
    parser.add_argument("--output", help="Output file")
    parser.add_argument("--min-priority", type=int, default=2, help="Min priority to include")
    args = parser.parse_args()

    # Scraped post text is arbitrary Unicode; pin UTF-8 so reading and
    # writing don't depend on the platform's locale encoding (the
    # original relied on the default, which breaks on e.g. Windows).
    with open(args.input, encoding='utf-8') as f:
        data = json.load(f)

    posts = data.get('posts', [])
    print(f"Triaging {len(posts)} posts...")

    triaged = [triage_post(post) for post in posts]

    # Sort by priority, highest first
    triaged.sort(key=lambda t: t['priority'], reverse=True)

    # Filter to the investigation queue
    worth = [t for t in triaged if t['priority'] >= args.min_priority]

    output = {
        'triaged_at': datetime.now(timezone.utc).isoformat(),
        'total_posts': len(posts),
        'worth_investigating': len(worth),
        'posts': triaged,
        'investigation_queue': worth,
    }

    # Default: triage.json beside the input file.
    output_path = args.output or os.path.join(os.path.dirname(args.input), 'triage.json')
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps emoji/non-ASCII post text readable.
        json.dump(output, f, indent=2, ensure_ascii=False)

    # Print summary
    print("\n=== Triage Summary ===")
    print(f"Total: {len(posts)} | Worth investigating: {len(worth)}")

    if worth:
        print("\n=== Investigation Queue ===")
        for t in worth:
            author = t['author'].get('handle', '?')
            claim_types = [c['type'] for c in t['claims']]
            link_types = [link['type'] for link in t['investigatable_links']]
            print(f"\n [{t['priority']}] {author}")
            print(f" {t['text'][:150]}...")
            if claim_types:
                print(f" Claims: {', '.join(claim_types)}")
            if link_types:
                print(f" Links: {', '.join(link_types)}")
            print(f" Tasks: {len(t['tasks'])}")
    else:
        print("\nNo posts met the investigation threshold.")

    print(f"\nSaved to {output_path}")
|
||||
Reference in New Issue
Block a user