Files
workspace/skills/deep-scraper/scripts/triage-posts.py
Case 5ce3e812a1 Feed Hunter portal: dark theme dashboard on localhost:8888
- Web portal with 5 views: dashboard, feed, investigations, sims, status
- Enhanced triage with 40+ claim patterns
- Position monitor script
- Pipeline report generator
- Systemd service for portal
2026-02-08 00:06:24 -06:00

341 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Triage scraped X posts — identify posts with verifiable claims and links.
Extracts structured "investigation tasks" for agent follow-up.
Usage:
python3 triage-posts.py <posts.json> [--output triage.json]
"""
import argparse
import json
import re
import os
from datetime import datetime, timezone
# Patterns that suggest a verifiable claim.
# Each entry is (regex, claim_type). Patterns are applied case-insensitively
# by extract_claims(); the claim_type label drives priority scoring and task
# generation in triage_post().
CLAIM_PATTERNS: list[tuple[str, str]] = [
    # Specific dollar amounts and profits
    (r'\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\s*(profit|gained|made|earned|up|return)', 'dollar_profit'),
    (r'made\s+\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', 'dollar_profit'),
    (r'(\d{1,3}(?:,\d{3})*)\s*(k|K)\s*(profit|gained|made|earned|up|return)', 'k_profit'),
    (r'(\d+(?:\.\d+)?)\s*(million|M)\s+(profit|made|earned|up)', 'million_profit'),
    # Performance claims
    (r'(\d+[\d.]*)\s*%\s*(win|success|profit|return|accuracy|hit rate)', 'performance_claim'),
    (r'wins?\s+(\d+[\d.]*)\s*%', 'performance_claim'),
    (r'(\d+[\d.]*)\s*%\s+of the time', 'performance_claim'),
    (r'(\d+[\d.]*)x\s+(return|profit|gain|returns)', 'multiplier_claim'),
    (r'(\d+[\d.]*)\s*x\s+in\s+(\d+)\s+(day|week|month|year)s?', 'timeframe_multiplier'),
    # Trading track record claims
    (r'(\d+)\s+(win|profitable)\s+(trade|bet|position)s?\s+in\s+a\s+row', 'streak_claim'),
    (r'(\d+)\s+out\s+of\s+(\d+)\s+(trade|bet|call)s?\s+(right|correct|profitable)', 'ratio_claim'),
    (r'(\d+[\d.]*)\s*%\s+accuracy\s+(this|last)\s+(week|month|year)', 'accuracy_claim'),
    (r'portfolio\s+up\s+(\d+[\d.]*)\s*%', 'portfolio_performance'),
    # Copy/follow trading
    (r'copy(ing|cat)?\s+(trader|user|bet|position|strat)', 'copy_trading'),
    (r'follow\s+(this|my|their)\s+(trade|bet|position|strat)', 'copy_trading'),
    (r'mirror(ing)?\s+(trade|bet|position)', 'copy_trading'),
    (r'best\s+(trader|performer)\s+(on|in)\s+\w+', 'top_trader'),
    # Crypto/DeFi specific claims
    (r'(\d+[\d.]*)\s+(eth|btc|sol|bnb|ada|dot)\s+(profit|gained|made|up)', 'crypto_profit'),
    (r'yield\s+farming\s+.{0,30}\s*(\d+[\d.]*)\s*%', 'yield_claim'),
    (r'staking\s+.{0,30}\s*(\d+[\d.]*)\s*%\s*(apy|apr)', 'staking_yield'),
    (r'liquidity\s+provider?\s+.{0,30}\s*(\d+[\d.]*)\s*%', 'lp_yield'),
    (r'(\d+[\d.]*)x\s+(leverage|margin)', 'leverage_claim'),
    # NFT claims
    (r'(flip|sold|bought)\s+.{0,30}\s+for\s+(\d+[\d.]*)\s+(eth|sol)', 'nft_flip'),
    (r'(\d+[\d.]*)\s+(eth|sol)\s+floor', 'nft_floor'),
    (r'mint\s+(price|cost)\s+(\d+[\d.]*)', 'mint_price'),
    # Arbitrage/spread
    (r'(arb|arbitrage|spread|mismatch|mispriced)', 'arbitrage_opp'),
    (r'risk[\s-]?free', 'arbitrage_opp'),
    (r'guaranteed\s+(profit|return|money)', 'arbitrage_opp'),
    (r'(\d+[\d.]*)\s*%\s+guaranteed', 'guaranteed_return'),
    # Sports betting / Prediction markets
    (r'(polymarket|kalshi|manifold|prediction\s+market)', 'prediction_market'),
    (r'(odds|probability)\s+.{0,20}\s*(\d+[\d.]*)\s*%', 'odds_claim'),
    (r'(yes|no)\s+shares?\s+at\s+(\d+)', 'shares_price'),
    (r'betting\s+(\d+[\d.]*)\s*%\s+on', 'betting_confidence'),
    (r'(\d+[\d.]*)\s*%\s+chance\s+(of|that)', 'probability_estimate'),
    # Price/target claims
    (r'(target|pt|price target)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'price_target'),
    (r'(entry|buy)\s*(at|zone|point)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'entry_point'),
    (r'next\s+(resistance|support)\s+at\s+\$?(\d[\d,.]*)', 'technical_level'),
    (r'going\s+to\s+\$?(\d[\d,.]*)', 'price_prediction'),
    # Airdrop/free money
    (r'(airdrop|free\s+money|free\s+tokens?|claiming)', 'airdrop'),
    (r'(step\s+\d|how\s+to\s+(get|claim|earn))', 'howto'),
    (r'eligible\s+for\s+(\d+[\d.]*)\s+tokens?', 'airdrop_amount'),
    # Signal/alert claims
    (r'(signal|alert|call)\s+.{0,30}\s*(\d+[\d.]*)\s*%\s+(gain|profit)', 'signal_performance'),
    (r'last\s+(\d+)\s+(signal|call|alert)s?\s+all\s+(profit|green|won)', 'signal_streak'),
    (r'(\d+[\d.]*)\s*%\s+win\s+rate\s+on\s+(signal|call)s?', 'signal_winrate'),
    # Time-sensitive claims
    (r'(today|yesterday|this\s+week)\s+.{0,30}\s*(\d+[\d.]*)\s*%\s+(up|gain|profit)', 'recent_performance'),
    (r'(\d+[\d.]*)\s*%\s+in\s+(\d+)\s+(hour|day|week)s?', 'timeframe_performance'),
    (r'last\s+(\d+)\s+(day|week|month)s?\s+.{0,30}\s*(\d+[\d.]*)\s*%', 'period_performance'),
    # User/account references with performance
    (r'@\w+.*?(portfolio|track\s*record|history|performance)', 'user_reference'),
    (r'(this\s+(guy|trader|user|person|account))\s+.{0,30}(profit|win|return|made)', 'user_reference'),
    (r'trader\s+with\s+(\d+[\d.]*)\s*%\s+(accuracy|win\s+rate)', 'trader_stats'),
    (r'(\d+[\d.]*)\s*(million|M|k|K)\s+follower\s+trader', 'influencer_trader'),
    # Strategy claims
    (r'(strategy|method|system)\s+that\s+.{0,30}(\d+[\d.]*)\s*%', 'strategy_claim'),
    (r'using\s+.{0,30}\s+(\d+[\d.]*)\s*%\s+(success|win)', 'method_performance'),
    (r'secret\s+(strategy|method|formula)', 'secret_method'),
    # Options/derivatives
    (r'(call|put)\s+option\s+.{0,30}\s*(\d+[\d.]*)\s*%', 'options_claim'),
    (r'(\d+[\d.]*)\s*%\s+(otm|itm|out\s+of\s+the\s+money|in\s+the\s+money)', 'options_probability'),
    (r'delta\s+(\d+[\d.]*)', 'options_delta'),
    # Course/education monetization
    (r'course\s+that\s+.{0,30}(\d+[\d.]*)\s*(k|K|dollar)', 'course_earning'),
    (r'teaching\s+.{0,30}\s+(\d+[\d.]*)\s*(k|K)\s+student', 'teaching_claim'),
    (r'(\d+[\d.]*)\s*%\s+of\s+students\s+(profit|succeed)', 'student_success'),
]
# Link domains that are investigatable.
# Maps a domain substring to a link-type label; classify_links() tags a URL
# with the label of the first matching domain (substring match on the URL).
INVESTIGATABLE_DOMAINS: dict[str, str] = {
    'polymarket.com': 'prediction_market',
    'kalshi.com': 'prediction_market',
    'manifold.markets': 'prediction_market',
    'dexscreener.com': 'token_chart',
    'dextools.io': 'token_chart',
    'birdeye.so': 'token_chart',
    'coingecko.com': 'token_info',
    'coinmarketcap.com': 'token_info',
    'tradingview.com': 'chart',
    'etherscan.io': 'blockchain',
    'solscan.io': 'blockchain',
    'basescan.org': 'blockchain',
    'github.com': 'code_repo',
    'docs.google.com': 'document',
    'notion.so': 'document',
    'medium.com': 'article',
    'substack.com': 'article',
    'youtube.com': 'video',
    'youtu.be': 'video',
}
def extract_claims(text):
    """Scan *text* against CLAIM_PATTERNS and return structured claim records.

    Each record is a dict with keys:
        'type'  - the claim_type label from CLAIM_PATTERNS
        'match' - the exact matched substring
        'span'  - [start, end) character offsets of the match in *text*

    Matching is case-insensitive; every pattern is tried against the full
    text, so a single post can yield multiple claims (including overlaps).
    """
    found = []
    for regex, label in CLAIM_PATTERNS:
        found.extend(
            {
                'type': label,
                'match': hit.group(0),
                'span': [hit.start(), hit.end()],
            }
            for hit in re.finditer(regex, text, re.IGNORECASE)
        )
    return found
def classify_links(links):
    """Tag each link with its investigatable domain type.

    Args:
        links: iterable of dicts with optional 'url' and 'text' keys.

    Returns:
        A list of dicts {'url', 'text', 'type'} where 'type' is the label
        of the first INVESTIGATABLE_DOMAINS entry whose domain appears as a
        substring of the URL, or 'unknown' when none match.
    """
    tagged = []
    for entry in links:
        href = entry.get('url', '')
        # First matching domain wins; fall back to 'unknown'.
        kind = next(
            (label for domain, label in INVESTIGATABLE_DOMAINS.items() if domain in href),
            'unknown',
        )
        tagged.append({
            'url': href,
            'text': entry.get('text', ''),
            'type': kind,
        })
    return tagged
def triage_post(post):
    """Analyze a single post for investigation potential.

    Extracts verifiable claims from the post text, classifies its links
    (including the link-card URL, if any), assigns an integer priority
    score, and builds concrete follow-up tasks for agent investigation.

    Args:
        post: scraped post dict; keys read are 'text', 'links', 'card',
            'author', 'url', 'timestamp', 'metrics'. The input dict is
            not modified.

    Returns:
        A dict with the post's identifying fields plus 'claims', 'links',
        'investigatable_links', 'priority', 'tasks', and a boolean
        'worth_investigating' flag (priority >= 2).
    """
    text = post.get('text', '') or ''
    # Extract claims
    claims = extract_claims(text)
    # Classify links. BUG FIX: copy the list — the original aliased
    # post['links'] and then appended the card link into the caller's data.
    all_links = list(post.get('links', []) or [])
    card = post.get('card') or {}
    if card.get('url'):
        all_links.append({'url': card['url'], 'text': card.get('title', '')})
    classified_links = classify_links(all_links)
    investigatable_links = [link for link in classified_links if link['type'] != 'unknown']
    # Has the post got something worth investigating?
    has_claims = len(claims) > 0
    has_links = len(investigatable_links) > 0
    has_any_links = len(classified_links) > 0
    # Priority scoring: additive weights, so a post with both strong claims
    # and investigatable links ranks highest.
    priority = 0
    if has_claims:
        priority += 2
        # Performance claims are highest priority
        if any(c['type'] in ('performance_claim', 'multiplier_claim') for c in claims):
            priority += 2
        if any(c['type'] in ('copy_trading', 'arbitrage_opp') for c in claims):
            priority += 1
    if has_links:
        priority += 2
        if any(link['type'] == 'prediction_market' for link in investigatable_links):
            priority += 2
        if any(link['type'] in ('token_chart', 'blockchain') for link in investigatable_links):
            priority += 1
    if has_any_links and not has_links:
        # Unclassified links still merit a small bump.
        priority += 1
    # Build investigation tasks from claims of actionable types.
    tasks = []
    for claim in claims:
        if claim['type'] == 'performance_claim':
            tasks.append({
                'action': 'verify_performance',
                'description': f"Verify claim: {claim['match']}",
                'method': 'Check linked profile/data for actual track record',
            })
        elif claim['type'] == 'copy_trading':
            tasks.append({
                'action': 'verify_trader',
                'description': f"Verify trader referenced: {claim['match']}",
                'method': 'Check trader profile, recent bets/trades, actual P&L',
            })
        elif claim['type'] == 'arbitrage_opp':
            tasks.append({
                'action': 'verify_arb',
                'description': f"Verify opportunity: {claim['match']}",
                'method': 'Check if spread/mismatch still exists, calculate actual risk',
            })
        elif claim['type'] in ('odds_claim', 'shares_price', 'prediction_market'):
            tasks.append({
                'action': 'check_market',
                'description': f"Check prediction market: {claim['match']}",
                'method': 'Verify current odds, volume, resolution criteria',
            })
        elif claim['type'] == 'price_target':
            tasks.append({
                'action': 'check_price',
                'description': f"Verify price claim: {claim['match']}",
                'method': 'Check current price vs target, chart pattern',
            })
        elif claim['type'] == 'airdrop':
            tasks.append({
                'action': 'verify_airdrop',
                'description': f"Check airdrop legitimacy: {claim['match']}",
                'method': 'Verify project, check for scam signals, confirm eligibility',
            })
    # Every recognized link becomes a browse task.
    for link in investigatable_links:
        tasks.append({
            'action': f'browse_{link["type"]}',
            'description': f'Follow and analyze: {link["url"]}',
            'url': link['url'],
            'method': f'Browse to {link["type"]} link, extract current data',
        })
    # For unknown links that might be interesting (skip internal x.com links).
    for link in classified_links:
        if link['type'] == 'unknown' and link['url'].startswith('http') and 'x.com' not in link['url']:
            tasks.append({
                'action': 'browse_unknown',
                'description': f'Check external link: {link["url"]}',
                'url': link['url'],
                'method': 'Browse to link, determine if relevant',
            })
    return {
        'author': post.get('author', {}),
        'text': text,
        'url': post.get('url', ''),
        'timestamp': post.get('timestamp'),
        'metrics': post.get('metrics', {}),
        'claims': claims,
        'links': classified_links,
        'investigatable_links': investigatable_links,
        'priority': priority,
        'tasks': tasks,
        'worth_investigating': priority >= 2,
    }
def main():
    """CLI entry point: load posts JSON, triage every post, write results.

    Reads the input file given on the command line, runs triage_post() over
    each post, sorts by descending priority, writes the full report as JSON
    (next to the input unless --output is given), and prints a summary of
    the investigation queue.
    """
    parser = argparse.ArgumentParser(description="Triage X posts for investigation")
    parser.add_argument("input", help="Path to posts.json")
    parser.add_argument("--output", help="Output file")
    parser.add_argument("--min-priority", type=int, default=2, help="Min priority to include")
    args = parser.parse_args()

    with open(args.input) as fh:
        payload = json.load(fh)
    posts = payload.get('posts', [])
    print(f"Triaging {len(posts)} posts...")

    # Triage, then rank highest-priority first.
    triaged = [triage_post(p) for p in posts]
    triaged.sort(key=lambda entry: entry['priority'], reverse=True)
    # Keep only posts meeting the threshold for the investigation queue.
    worth = [entry for entry in triaged if entry['priority'] >= args.min_priority]

    report = {
        'triaged_at': datetime.now(timezone.utc).isoformat(),
        'total_posts': len(posts),
        'worth_investigating': len(worth),
        'posts': triaged,
        'investigation_queue': worth,
    }
    out_path = args.output or os.path.join(os.path.dirname(args.input), 'triage.json')
    with open(out_path, 'w') as fh:
        json.dump(report, fh, indent=2)

    # Print summary
    print("\n=== Triage Summary ===")
    print(f"Total: {len(posts)} | Worth investigating: {len(worth)}")
    if not worth:
        print("\nNo posts met the investigation threshold.")
    else:
        print("\n=== Investigation Queue ===")
        for entry in worth:
            handle = entry['author'].get('handle', '?')
            claim_types = [c['type'] for c in entry['claims']]
            link_types = [l['type'] for l in entry['investigatable_links']]
            print(f"\n [{entry['priority']}] {handle}")
            print(f" {entry['text'][:150]}...")
            if claim_types:
                print(f" Claims: {', '.join(claim_types)}")
            if link_types:
                print(f" Links: {', '.join(link_types)}")
            print(f" Tasks: {len(entry['tasks'])}")
    print(f"\nSaved to {out_path}")
# Script entry point.
if __name__ == "__main__":
    main()