Feed Hunter: deep scraper skill, pipeline, simulator, first investigation

- Built deep-scraper skill (CDP-based X feed extraction)
- Three-stage pipeline: scrape → triage → investigate
- Paper trading simulator with position tracking
- First live investigation: verified kch123 Polymarket profile ($9.3M P&L)
- Opened first paper position: Seahawks Super Bowl @ 68c
- Telegram alerts with inline action buttons
- Portal build in progress (night shift)
2026-02-07 23:58:40 -06:00
parent b93228ddc2
commit 8638500190
31 changed files with 7752 additions and 40 deletions
--- a/skills/deep-scraper/scripts/analyze-posts.py
+++ b/skills/deep-scraper/scripts/analyze-posts.py
@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""
+Analyze scraped X/Twitter posts for money-making signals.
+Reads posts.json, classifies and scores each post.
+
+Usage:
+    python3 analyze-posts.py <path-to-posts.json> [--output analysis.json]
+"""
+
+import argparse
+import json
+import re
+import sys
+from datetime import datetime, timezone
+
+
+# Category keywords/patterns
+# Maps a category name to {"keywords": [...], "weight": float}.
+# classify_post() looks for each keyword in the lower-cased post text (so
+# keywords must be lower-case) and scores a category as
+# min(number_of_hits * weight * 0.2, 1.0).
+CATEGORIES = {
+    "crypto": {
+        "keywords": ["bitcoin", "btc", "ethereum", "eth", "solana", "sol", "crypto",
+                      "token", "defi", "dex", "nft", "airdrop", "memecoin", "altcoin",
+                      "bullish", "bearish", "pump", "dump", "moon", "hodl", "whale",
+                      "binance", "coinbase", "degen", "rug", "mint", "chain",
+                      "staking", "yield", "liquidity", "swap", "bridge"],
+        "weight": 1.0
+    },
+    "polymarket": {
+        "keywords": ["polymarket", "prediction market", "kalshi", "manifold",
+                      "betting market", "odds", "probability", "yes/no",
+                      "shares", "contract"],
+        "weight": 1.0
+    },
+    "arbitrage": {
+        "keywords": ["arbitrage", "arb", "spread", "price difference",
+                      "cross-exchange", "risk-free", "guaranteed profit",
+                      "mismatch", "exploit"],
+        "weight": 1.0
+    },
+    "trading": {
+        "keywords": ["long", "short", "leverage", "margin", "futures",
+                      "options", "calls", "puts", "entry", "exit", "target",
+                      "stop loss", "take profit", "chart", "technical analysis",
+                      "support", "resistance", "breakout", "reversal"],
+        "weight": 0.8
+    },
+    "money_opportunity": {
+        "keywords": ["free money", "easy money", "passive income", "side hustle",
+                      "make money", "earn", "profit", "roi", "returns",
+                      "alpha", "signal", "opportunity", "undervalued"],
+        "weight": 0.7
+    }
+}
+
+# Spam/scam signals
+# "patterns" are regular expressions searched case-insensitively by
+# classify_post(); every pattern that hits adds 0.25 to the spam score,
+# capped at 1.0. The "weight" entry is not read by classify_post().
+SPAM_SIGNALS = {
+    "patterns": [
+        r"dm me", r"link in bio", r"join my", r"guaranteed \d+%",
+        r"100x", r"1000x", r"send .* to receive",
+        r"whitelist", r"presale", r"limited spots",
+        r"act now", r"don't miss", r"last chance",
+        r"🚀{3,}", r"💰{3,}", r"🔥{3,}",
+        r"follow.*retweet.*like", r"giveaway",
+        r"drop.*wallet", r"reply.*address"
+    ],
+    "weight": -1.0
+}
+
+# Time sensitivity signals
+# Regular expressions searched case-insensitively by classify_post(); any
+# hit marks the post as time-sensitive and the matched text is recorded.
+TIME_SENSITIVE = [
+    r"ending (soon|today|tonight|in \d+)",
+    r"last \d+ (hour|minute|day)",
+    r"expires? (today|tonight|soon|in)",
+    r"deadline",
+    r"closing (soon|in)",
+    r"only \d+ (left|remaining|spots)",
+    r"window closing",
+    r"before .* (ends|closes|expires)"
+]
+
+
+def classify_post(post):
+    """Classify a single post and return analysis."""
+    text = ((post.get("text") or "") + " " + 
+            ((post.get("card") or {}).get("title") or "") + " " +
+            ((post.get("card") or {}).get("description") or "")).lower()
+    
+    # Category detection
+    categories = {}
+    for cat_name, cat_info in CATEGORIES.items():
+        matches = [kw for kw in cat_info["keywords"] if kw in text]
+        if matches:
+            categories[cat_name] = {
+                "matched": matches,
+                "score": min(len(matches) * cat_info["weight"] * 0.2, 1.0)
+            }
+    
+    # Spam detection
+    spam_matches = []
+    for pattern in SPAM_SIGNALS["patterns"]:
+        if re.search(pattern, text, re.IGNORECASE):
+            spam_matches.append(pattern)
+    spam_score = min(len(spam_matches) * 0.25, 1.0)
+    
+    # Time sensitivity
+    time_sensitive = False
+    time_matches = []
+    for pattern in TIME_SENSITIVE:
+        m = re.search(pattern, text, re.IGNORECASE)
+        if m:
+            time_sensitive = True
+            time_matches.append(m.group(0))
+    
+    # Engagement quality (high engagement = more likely legit)
+    metrics = post.get("metrics", {})
+    engagement_score = 0
+    try:
+        likes = int(str(metrics.get("likes", "0")).replace(",", ""))
+        reposts = int(str(metrics.get("reposts", "0")).replace(",", ""))
+        views = int(str(metrics.get("views", "0")).replace(",", ""))
+        if views > 0:
+            engagement_rate = (likes + reposts) / views
+            engagement_score = min(engagement_rate * 100, 1.0)
+    except (ValueError, ZeroDivisionError):
+        pass
+    
+    # Has external links (higher value for analysis)
+    external_links = [l for l in post.get("links", []) 
+                      if l.get("url", "").startswith("http") and "x.com" not in l.get("url", "")]
+    
+    # Overall signal score
+    category_score = max((c["score"] for c in categories.values()), default=0)
+    signal_score = max(0, min(1.0,
+        category_score * 0.4 +
+        engagement_score * 0.2 +
+        (0.1 if external_links else 0) +
+        (0.1 if time_sensitive else 0) -
+        spam_score * 0.3
+    ))
+    
+    # Verdict
+    if spam_score > 0.5:
+        verdict = "likely_spam"
+    elif signal_score > 0.5 and categories:
+        verdict = "high_signal"
+    elif signal_score > 0.25 and categories:
+        verdict = "medium_signal"
+    elif categories:
+        verdict = "low_signal"
+    else:
+        verdict = "noise"
+    
+    return {
+        "author": post.get("author", {}),
+        "text_preview": post.get("text", "")[:200],
+        "url": post.get("url", ""),
+        "categories": categories,
+        "spam_score": round(spam_score, 2),
+        "spam_matches": spam_matches,
+        "time_sensitive": time_sensitive,
+        "time_matches": time_matches,
+        "engagement_score": round(engagement_score, 2),
+        "external_links": external_links,
+        "signal_score": round(signal_score, 2),
+        "verdict": verdict,
+        "timestamp": post.get("timestamp"),
+        "metrics": metrics
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Analyze X feed posts")
+    parser.add_argument("input", help="Path to posts.json")
+    parser.add_argument("--output", help="Output file (default: analysis.json in same dir)")
+    parser.add_argument("--min-signal", type=float, default=0.0, help="Min signal score to include")
+    args = parser.parse_args()
+    
+    with open(args.input) as f:
+        data = json.load(f)
+    
+    posts = data.get("posts", [])
+    print(f"Analyzing {len(posts)} posts...")
+    
+    analyses = []
+    for post in posts:
+        analysis = classify_post(post)
+        if analysis["signal_score"] >= args.min_signal:
+            analyses.append(analysis)
+    
+    # Sort by signal score descending
+    analyses.sort(key=lambda x: x["signal_score"], reverse=True)
+    
+    # Stats
+    verdicts = {}
+    for a in analyses:
+        v = a["verdict"]
+        verdicts[v] = verdicts.get(v, 0) + 1
+    
+    result = {
+        "analyzed_at": datetime.now(timezone.utc).isoformat(),
+        "total_posts": len(posts),
+        "analyzed_posts": len(analyses),
+        "verdicts": verdicts,
+        "posts": analyses
+    }
+    
+    # Output
+    output_path = args.output
+    if not output_path:
+        import os
+        output_path = os.path.join(os.path.dirname(args.input), "analysis.json")
+    
+    with open(output_path, "w") as f:
+        json.dump(result, f, indent=2)
+    
+    # Print summary
+    print(f"\n=== Analysis Summary ===")
+    print(f"Total posts: {len(posts)}")
+    for verdict, count in sorted(verdicts.items()):
+        emoji = {"high_signal": "🟢", "medium_signal": "🟡", "low_signal": "⚪", 
+                 "likely_spam": "🔴", "noise": "⚫"}.get(verdict, "❓")
+        print(f"  {emoji} {verdict}: {count}")
+    
+    # Show top signals
+    high = [a for a in analyses if a["verdict"] in ("high_signal", "medium_signal")]
+    if high:
+        print(f"\n=== Top Signals ===")
+        for a in high[:10]:
+            cats = ", ".join(a["categories"].keys())
+            ts = "⏰" if a["time_sensitive"] else ""
+            print(f"  [{a['signal_score']:.2f}] {a['author'].get('handle', '?')} — {cats} {ts}")
+            print(f"    {a['text_preview'][:100]}...")
+    
+    print(f"\nSaved to {output_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/skills/deep-scraper/scripts/launch-chrome-debug.sh
+++ b/skills/deep-scraper/scripts/launch-chrome-debug.sh
@ -0,0 +1,42 @@
+#!/bin/bash
+# Launch Chrome with remote debugging for deep scraping
+# Uses a copy of the user's profile to enable debug port
+#
+# Usage: launch-chrome-debug.sh [PORT]    (PORT defaults to 9222)
+# Exits 0 once the DevTools endpoint answers, 1 if it is not up after 15s.
+#
+# SECURITY NOTE: the debug profile contains a copy of the real profile's
+# cookies, and --remote-allow-origins=* lets any origin that can reach the
+# port drive this browser. Keep the port bound to localhost only.
+
+PROFILE_SRC="/home/wdjones/.config/google-chrome"
+PROFILE_DBG="/home/wdjones/.config/google-chrome-debug"
+PORT="${1:-9222}"
+
+# Kill any existing debug chrome.
+# Match on the debug profile argument instead of the bare string
+# "chrome-debug": `pkill -f` compares against full command lines, and this
+# script's own file name contains "chrome-debug", so the broader pattern can
+# signal the script itself.
+pkill -f -- "--user-data-dir=$PROFILE_DBG" 2>/dev/null
+
+# Create/sync debug profile
+if [ ! -d "$PROFILE_DBG" ]; then
+    echo "Creating debug profile (first time, this takes a moment)..."
+    cp -r "$PROFILE_SRC" "$PROFILE_DBG"
+else
+    # Sync cookies and local storage
+    mkdir -p "$PROFILE_DBG/Default/Local Storage"
+    cp "$PROFILE_SRC/Default/Cookies" "$PROFILE_DBG/Default/Cookies" 2>/dev/null
+    # Copy the directory *contents*: `cp -r SRC DST` with an existing DST
+    # would create DST/"Local Storage" instead of refreshing DST.
+    cp -r "$PROFILE_SRC/Default/Local Storage/." "$PROFILE_DBG/Default/Local Storage/" 2>/dev/null
+fi
+
+# Stale singleton files from the copied profile would make Chrome think
+# another instance already owns this profile.
+rm -f "$PROFILE_DBG/SingletonLock" "$PROFILE_DBG/SingletonSocket" "$PROFILE_DBG/SingletonCookie" 2>/dev/null
+
+# The origins flag is quoted so the shell never glob-expands the "*".
+DISPLAY=:0 /usr/bin/google-chrome-stable --no-sandbox \
+    --user-data-dir="$PROFILE_DBG" \
+    --remote-debugging-port="$PORT" \
+    "--remote-allow-origins=*" \
+    https://x.com/home &>/dev/null &
+
+echo "Chrome launched (PID $!, debug port $PORT)"
+echo "Waiting for port..."
+
+for i in $(seq 1 15); do
+    if curl -s "http://127.0.0.1:$PORT/json" >/dev/null 2>&1; then
+        echo "Ready!"
+        exit 0
+    fi
+    sleep 1
+done
+
+echo "ERROR: Port $PORT not ready after 15s"
+exit 1
--- a/skills/deep-scraper/scripts/scrape-x-feed.py
+++ b/skills/deep-scraper/scripts/scrape-x-feed.py
@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+"""
+Deep scraper for X/Twitter feed — extracts structured post data via CDP.
+Connects to an existing Chrome instance with remote debugging enabled.
+
+Usage:
+    python3 scrape-x-feed.py [--port 9222] [--scroll-pages 5] [--output DIR]
+
+Requires: Chrome running with --remote-debugging-port=9222
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+import subprocess
+import urllib.request
+from datetime import datetime
+
+import websocket
+
+
+def cdp_send(ws, method, params=None, msg_id=[0]):
+    """Send CDP command over an open WebSocket connection."""
+    msg_id[0] += 1
+    mid = msg_id[0]
+    msg = {"id": mid, "method": method, "params": params or {}}
+    ws.settimeout(15)
+    ws.send(json.dumps(msg))
+    for _ in range(100):  # max 100 events before giving up
+        try:
+            resp = json.loads(ws.recv())
+        except Exception:
+            return {"error": "timeout"}
+        if resp.get("id") == mid:
+            return resp
+        # Skip CDP events
+    return {"error": "too many events"}
+
+
+def get_ws_url(port):
+    """Get WebSocket debugger URL from Chrome DevTools."""
+    url = f"http://127.0.0.1:{port}/json"
+    resp = urllib.request.urlopen(url, timeout=5)
+    tabs = json.loads(resp.read())
+    for tab in tabs:
+        if "x.com" in tab.get("url", ""):
+            return tab["webSocketDebuggerUrl"]
+    # Fallback to first tab
+    if tabs:
+        return tabs[0]["webSocketDebuggerUrl"]
+    raise RuntimeError("No Chrome tabs found")
+
+
+# JavaScript to extract tweets from the DOM
+#
+# Evaluated in the page by scrape_via_cdp() through Runtime.evaluate. The
+# expression walks every `article[data-testid="tweet"]` element and returns
+# a JSON *string* holding a list of objects with these keys:
+#   author {displayName, handle}, text, timestamp, timeText, url, metrics,
+#   links, media, card, repostBy
+# Notes:
+#   - Posts are de-duplicated inside one evaluation by handle plus the
+#     first 50 characters of the text.
+#   - Metric values are the raw digit strings captured from button
+#     aria-labels with (\d[\d,.]*), so they stay strings.
+#   - An article that throws during extraction is skipped silently.
+#   - The literal is a raw string so the backslashes in the JavaScript
+#     regular expressions reach the browser unchanged.
+EXTRACT_JS = r"""
+(() => {
+    const posts = [];
+    const seen = new Set();
+    
+    // X/Twitter uses article elements for tweets
+    const articles = document.querySelectorAll('article[data-testid="tweet"]');
+    
+    for (const article of articles) {
+        try {
+            // Author info
+            const userLinks = article.querySelectorAll('a[role="link"]');
+            let displayName = '';
+            let handle = '';
+            for (const link of userLinks) {
+                const href = link.getAttribute('href') || '';
+                if (href.match(/^\/[a-zA-Z0-9_]+$/) && !href.includes('/status/')) {
+                    if (!handle) handle = href.replace('/', '@');
+                    const nameEl = link.querySelector('span');
+                    if (nameEl && !displayName) displayName = nameEl.textContent.trim();
+                }
+            }
+            
+            // Tweet text
+            const textEl = article.querySelector('[data-testid="tweetText"]');
+            const text = textEl ? textEl.textContent.trim() : '';
+            
+            // Skip if we've seen this exact text+author combo
+            const key = `${handle}:${text.slice(0, 50)}`;
+            if (seen.has(key)) continue;
+            seen.add(key);
+            
+            // Timestamp
+            const timeEl = article.querySelector('time');
+            const timestamp = timeEl ? timeEl.getAttribute('datetime') : null;
+            const timeText = timeEl ? timeEl.textContent.trim() : '';
+            
+            // Link to tweet
+            let tweetUrl = '';
+            const statusLinks = article.querySelectorAll('a[href*="/status/"]');
+            for (const sl of statusLinks) {
+                const href = sl.getAttribute('href') || '';
+                if (href.match(/\/status\/\d+$/)) {
+                    tweetUrl = 'https://x.com' + href;
+                    break;
+                }
+            }
+            
+            // Engagement metrics
+            const metrics = {};
+            const groups = article.querySelectorAll('[role="group"]');
+            for (const group of groups) {
+                const buttons = group.querySelectorAll('button');
+                for (const btn of buttons) {
+                    const label = btn.getAttribute('aria-label') || '';
+                    if (label.includes('repl')) {
+                        const m = label.match(/(\d[\d,.]*)/);
+                        if (m) metrics.replies = m[1];
+                    } else if (label.includes('repost') || label.includes('Repost')) {
+                        const m = label.match(/(\d[\d,.]*)/);
+                        if (m) metrics.reposts = m[1];
+                    } else if (label.includes('like') || label.includes('Like')) {
+                        const m = label.match(/(\d[\d,.]*)/);
+                        if (m) metrics.likes = m[1];
+                    } else if (label.includes('view') || label.includes('View')) {
+                        const m = label.match(/(\d[\d,.]*)/);
+                        if (m) metrics.views = m[1];
+                    } else if (label.includes('bookmark') || label.includes('Bookmark')) {
+                        const m = label.match(/(\d[\d,.]*)/);
+                        if (m) metrics.bookmarks = m[1];
+                    }
+                }
+            }
+            
+            // Embedded links
+            const links = [];
+            if (textEl) {
+                const anchors = textEl.querySelectorAll('a');
+                for (const a of anchors) {
+                    const href = a.getAttribute('href') || '';
+                    const linkText = a.textContent.trim();
+                    if (href && !href.startsWith('/')) {
+                        links.push({ url: href, text: linkText });
+                    } else if (href.startsWith('/')) {
+                        links.push({ url: 'https://x.com' + href, text: linkText });
+                    }
+                }
+            }
+            
+            // Media (images/video indicators)
+            const media = [];
+            const imgs = article.querySelectorAll('[data-testid="tweetPhoto"] img');
+            for (const img of imgs) {
+                media.push({ type: 'image', src: img.src });
+            }
+            const videoEl = article.querySelector('[data-testid="videoPlayer"]');
+            if (videoEl) media.push({ type: 'video' });
+            
+            // Card/preview
+            const card = article.querySelector('[data-testid="card.wrapper"]');
+            let cardData = null;
+            if (card) {
+                const cardTitle = card.querySelector('[data-testid="card.layoutLarge.title"], [data-testid="card.layoutSmall.title"]');
+                const cardDesc = card.querySelector('[data-testid="card.layoutLarge.description"], [data-testid="card.layoutSmall.description"]');
+                const cardLink = card.querySelector('a');
+                cardData = {
+                    title: cardTitle ? cardTitle.textContent.trim() : null,
+                    description: cardDesc ? cardDesc.textContent.trim() : null,
+                    url: cardLink ? cardLink.getAttribute('href') : null
+                };
+            }
+            
+            // Is it a repost?
+            const socialContext = article.querySelector('[data-testid="socialContext"]');
+            let repostBy = null;
+            if (socialContext && socialContext.textContent.includes('reposted')) {
+                repostBy = socialContext.textContent.replace(' reposted', '').trim();
+            }
+            
+            posts.push({
+                author: { displayName, handle },
+                text,
+                timestamp,
+                timeText,
+                url: tweetUrl,
+                metrics,
+                links,
+                media,
+                card: cardData,
+                repostBy
+            });
+        } catch (e) {
+            // Skip malformed tweets
+        }
+    }
+    
+    return JSON.stringify(posts);
+})()
+"""
+
+
+def scrape_via_cdp(port, scroll_pages, output_dir):
+    """Scrape X feed using Chrome DevTools Protocol."""
+    ws_url = get_ws_url(port)
+    print(f"Connecting to: {ws_url}")
+    ws = websocket.create_connection(ws_url, timeout=30)
+    print("Connected!")
+    
+    all_posts = []
+    seen_keys = set()
+    
+    for page in range(scroll_pages):
+        print(f"Scraping page {page + 1}/{scroll_pages}...")
+        
+        # Execute extraction JS
+        result = cdp_send(ws, "Runtime.evaluate", {
+            "expression": EXTRACT_JS,
+            "returnByValue": True
+        })
+        
+        value = result.get("result", {}).get("result", {}).get("value", "[]")
+        posts = json.loads(value) if isinstance(value, str) else []
+        
+        # Deduplicate
+        new = 0
+        for post in posts:
+            key = f"{post['author']['handle']}:{post['text'][:80]}"
+            if key not in seen_keys:
+                seen_keys.add(key)
+                all_posts.append(post)
+                new += 1
+        
+        print(f"  Found {len(posts)} posts ({new} new)")
+        
+        if page < scroll_pages - 1:
+            # Scroll down
+            cdp_send(ws, "Runtime.evaluate", {
+                "expression": "window.scrollBy(0, window.innerHeight * 2)"
+            })
+            time.sleep(3)
+    
+    ws.close()
+    
+    # Save output
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    run_dir = os.path.join(output_dir, timestamp)
+    os.makedirs(run_dir, exist_ok=True)
+    
+    output_file = os.path.join(run_dir, "posts.json")
+    with open(output_file, "w") as f:
+        json.dump({
+            "timestamp": timestamp,
+            "total_posts": len(all_posts),
+            "posts": all_posts
+        }, f, indent=2)
+    
+    # Also save a human-readable summary
+    summary_file = os.path.join(run_dir, "summary.md")
+    with open(summary_file, "w") as f:
+        f.write(f"# X Feed Scrape — {timestamp}\n\n")
+        f.write(f"**Total posts:** {len(all_posts)}\n\n")
+        for i, post in enumerate(all_posts, 1):
+            author = post['author']
+            f.write(f"## {i}. {author['displayName']} ({author['handle']})\n")
+            if post.get('repostBy'):
+                f.write(f"*Reposted by {post['repostBy']}*\n")
+            f.write(f"\n{post['text']}\n\n")
+            if post.get('metrics'):
+                m = post['metrics']
+                parts = []
+                for k, v in m.items():
+                    parts.append(f"{k}: {v}")
+                f.write(f"📊 {' | '.join(parts)}\n")
+            if post.get('links'):
+                f.write(f"\n🔗 Links:\n")
+                for link in post['links']:
+                    f.write(f"  - [{link['text']}]({link['url']})\n")
+            if post.get('card'):
+                c = post['card']
+                f.write(f"\n📎 Card: {c.get('title', 'N/A')}\n")
+                if c.get('description'):
+                    f.write(f"   {c['description']}\n")
+            if post.get('url'):
+                f.write(f"\n🔗 {post['url']}\n")
+            f.write(f"\n---\n\n")
+    
+    print(f"\nDone! {len(all_posts)} posts saved to {run_dir}/")
+    print(f"  posts.json  — structured data")
+    print(f"  summary.md  — human-readable")
+    return run_dir
+
+
+def scrape_via_xdotool(scroll_pages, output_dir):
+    """Fallback: use xdotool + JS injection via xdg approach. 
+    This launches Chrome with remote debugging if not already running."""
+    
+    # Check if Chrome is running with debugging port
+    try:
+        urllib.request.urlopen("http://127.0.0.1:9222/json", timeout=2)
+        return scrape_via_cdp(9222, scroll_pages, output_dir)
+    except Exception:
+        pass
+    
+    # Launch Chrome with remote debugging
+    print("Launching Chrome with remote debugging...")
+    subprocess.Popen([
+        "/usr/bin/google-chrome-stable",
+        "--no-sandbox",
+        "--user-data-dir=/home/wdjones/.config/google-chrome",
+        "--remote-debugging-port=9222",
+        "https://x.com/home"
+    ], env={**os.environ, "DISPLAY": ":0"}, 
+       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    
+    # Wait for Chrome to be ready
+    for _ in range(20):
+        try:
+            urllib.request.urlopen("http://127.0.0.1:9222/json", timeout=2)
+            break
+        except Exception:
+            time.sleep(1)
+    else:
+        print("ERROR: Chrome didn't start with debugging port", file=sys.stderr)
+        sys.exit(1)
+    
+    time.sleep(5)  # Let page load
+    return scrape_via_cdp(9222, scroll_pages, output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Deep scrape X/Twitter feed")
+    parser.add_argument("--port", type=int, default=9222, help="Chrome debugging port")
+    parser.add_argument("--scroll-pages", type=int, default=5, help="Number of scroll pages")
+    parser.add_argument("--output", default="/home/wdjones/.openclaw/workspace/data/x-feed",
+                        help="Output directory")
+    parser.add_argument("--launch", action="store_true", help="Launch Chrome if not running")
+    args = parser.parse_args()
+    
+    os.makedirs(args.output, exist_ok=True)
+    
+    if args.launch:
+        scrape_via_xdotool(args.scroll_pages, args.output)
+    else:
+        scrape_via_cdp(args.port, args.scroll_pages, args.output)
+
+
+if __name__ == "__main__":
+    main()
--- a/skills/deep-scraper/scripts/triage-posts.py
+++ b/skills/deep-scraper/scripts/triage-posts.py
@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Triage scraped X posts — identify posts with verifiable claims and links.
+Extracts structured "investigation tasks" for agent follow-up.
+
+Usage:
+    python3 triage-posts.py <posts.json> [--output triage.json]
+"""
+
+import argparse
+import json
+import re
+import os
+from datetime import datetime, timezone
+
+
+# Patterns that suggest a verifiable claim
+CLAIM_PATTERNS = [
+    # Performance claims
+    (r'(\d+[\d.]*)\s*%\s*(win|success|profit|return|accuracy|hit rate)', 'performance_claim'),
+    (r'wins?\s+(\d+[\d.]*)\s*%', 'performance_claim'),
+    (r'(\d+[\d.]*)\s*%\s+of the time', 'performance_claim'),
+    (r'(\d+[\d.]*)x\s+(return|profit|gain)', 'multiplier_claim'),
+    
+    # Copy/follow trading
+    (r'copy(ing|cat)?\s+(trader|user|bet|position|strat)', 'copy_trading'),
+    (r'follow\s+(this|my|their)\s+(trade|bet|position|strat)', 'copy_trading'),
+    (r'mirror(ing)?\s+(trade|bet|position)', 'copy_trading'),
+    
+    # Arbitrage/spread
+    (r'(arb|arbitrage|spread|mismatch|mispriced)', 'arbitrage_opp'),
+    (r'risk[\s-]?free', 'arbitrage_opp'),
+    (r'guaranteed\s+(profit|return|money)', 'arbitrage_opp'),
+    
+    # Prediction/betting
+    (r'(polymarket|kalshi|manifold|prediction\s+market)', 'prediction_market'),
+    (r'(odds|probability)\s+.{0,20}\s*(\d+[\d.]*)\s*%', 'odds_claim'),
+    (r'(yes|no)\s+shares?\s+at\s+(\d+)', 'shares_price'),
+    
+    # Price/target claims
+    (r'(target|pt|price target)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'price_target'),
+    (r'(entry|buy)\s*(at|zone|point)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'entry_point'),
+    
+    # Airdrop/free money
+    (r'(airdrop|free\s+money|free\s+tokens?|claiming)', 'airdrop'),
+    (r'(step\s+\d|how\s+to\s+(get|claim|earn))', 'howto'),
+    
+    # User/account references
+    (r'@\w+.*?(portfolio|track\s*record|history|performance)', 'user_reference'),
+    (r'(this\s+(guy|trader|user|person|account))\s+.{0,30}(profit|win|return|made)', 'user_reference'),
+]
+
+# Link domains that are investigatable
+# Maps a domain to the link type reported by classify_links(). Entries are
+# tried in insertion order and the first domain that matches a link's URL
+# wins; links matching no entry are typed 'unknown'. triage_post() turns
+# each typed link into a 'browse_<type>' task.
+INVESTIGATABLE_DOMAINS = {
+    'polymarket.com': 'prediction_market',
+    'kalshi.com': 'prediction_market',
+    'manifold.markets': 'prediction_market',
+    'dexscreener.com': 'token_chart',
+    'dextools.io': 'token_chart',
+    'birdeye.so': 'token_chart',
+    'coingecko.com': 'token_info',
+    'coinmarketcap.com': 'token_info',
+    'tradingview.com': 'chart',
+    'etherscan.io': 'blockchain',
+    'solscan.io': 'blockchain',
+    'basescan.org': 'blockchain',
+    'github.com': 'code_repo',
+    'docs.google.com': 'document',
+    'notion.so': 'document',
+    'medium.com': 'article',
+    'substack.com': 'article',
+    'youtube.com': 'video',
+    'youtu.be': 'video',
+}
+
+
+def extract_claims(text):
+    """Extract verifiable claims from post text."""
+    claims = []
+    for pattern, claim_type in CLAIM_PATTERNS:
+        matches = re.finditer(pattern, text, re.IGNORECASE)
+        for m in matches:
+            claims.append({
+                'type': claim_type,
+                'match': m.group(0),
+                'span': [m.start(), m.end()],
+            })
+    return claims
+
+
+def classify_links(links):
+    """Classify links by investigatable domain."""
+    classified = []
+    for link in links:
+        url = link.get('url', '')
+        link_type = 'unknown'
+        for domain, dtype in INVESTIGATABLE_DOMAINS.items():
+            if domain in url:
+                link_type = dtype
+                break
+        classified.append({
+            'url': url,
+            'text': link.get('text', ''),
+            'type': link_type,
+        })
+    return classified
+
+
+def triage_post(post):
+    """Analyze a single post for investigation potential."""
+    text = post.get('text', '') or ''
+    
+    # Extract claims
+    claims = extract_claims(text)
+    
+    # Classify links
+    all_links = (post.get('links', []) or [])
+    card = post.get('card') or {}
+    if card.get('url'):
+        all_links.append({'url': card['url'], 'text': card.get('title', '')})
+    classified_links = classify_links(all_links)
+    investigatable_links = [l for l in classified_links if l['type'] != 'unknown']
+    
+    # Has the post got something worth investigating?
+    has_claims = len(claims) > 0
+    has_links = len(investigatable_links) > 0
+    has_any_links = len(classified_links) > 0
+    
+    # Priority scoring
+    priority = 0
+    if has_claims:
+        priority += 2
+        # Performance claims are highest priority
+        if any(c['type'] in ('performance_claim', 'multiplier_claim') for c in claims):
+            priority += 2
+        if any(c['type'] in ('copy_trading', 'arbitrage_opp') for c in claims):
+            priority += 1
+    if has_links:
+        priority += 2
+        if any(l['type'] == 'prediction_market' for l in investigatable_links):
+            priority += 2
+        if any(l['type'] in ('token_chart', 'blockchain') for l in investigatable_links):
+            priority += 1
+    if has_any_links and not has_links:
+        priority += 1
+    
+    # Build investigation tasks
+    tasks = []
+    
+    for claim in claims:
+        if claim['type'] == 'performance_claim':
+            tasks.append({
+                'action': 'verify_performance',
+                'description': f"Verify claim: {claim['match']}",
+                'method': 'Check linked profile/data for actual track record',
+            })
+        elif claim['type'] == 'copy_trading':
+            tasks.append({
+                'action': 'verify_trader',
+                'description': f"Verify trader referenced: {claim['match']}",
+                'method': 'Check trader profile, recent bets/trades, actual P&L',
+            })
+        elif claim['type'] == 'arbitrage_opp':
+            tasks.append({
+                'action': 'verify_arb',
+                'description': f"Verify opportunity: {claim['match']}",
+                'method': 'Check if spread/mismatch still exists, calculate actual risk',
+            })
+        elif claim['type'] in ('odds_claim', 'shares_price', 'prediction_market'):
+            tasks.append({
+                'action': 'check_market',
+                'description': f"Check prediction market: {claim['match']}",
+                'method': 'Verify current odds, volume, resolution criteria',
+            })
+        elif claim['type'] == 'price_target':
+            tasks.append({
+                'action': 'check_price',
+                'description': f"Verify price claim: {claim['match']}",
+                'method': 'Check current price vs target, chart pattern',
+            })
+        elif claim['type'] == 'airdrop':
+            tasks.append({
+                'action': 'verify_airdrop',
+                'description': f"Check airdrop legitimacy: {claim['match']}",
+                'method': 'Verify project, check for scam signals, confirm eligibility',
+            })
+    
+    for link in investigatable_links:
+        tasks.append({
+            'action': f'browse_{link["type"]}',
+            'description': f'Follow and analyze: {link["url"]}',
+            'url': link['url'],
+            'method': f'Browse to {link["type"]} link, extract current data',
+        })
+    
+    # For unknown links that might be interesting
+    for link in classified_links:
+        if link['type'] == 'unknown' and link['url'].startswith('http') and 'x.com' not in link['url']:
+            tasks.append({
+                'action': 'browse_unknown',
+                'description': f'Check external link: {link["url"]}',
+                'url': link['url'],
+                'method': 'Browse to link, determine if relevant',
+            })
+    
+    return {
+        'author': post.get('author', {}),
+        'text': text,
+        'url': post.get('url', ''),
+        'timestamp': post.get('timestamp'),
+        'metrics': post.get('metrics', {}),
+        'claims': claims,
+        'links': classified_links,
+        'investigatable_links': investigatable_links,
+        'priority': priority,
+        'tasks': tasks,
+        'worth_investigating': priority >= 2,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Triage X posts for investigation")
+    parser.add_argument("input", help="Path to posts.json")
+    parser.add_argument("--output", help="Output file")
+    parser.add_argument("--min-priority", type=int, default=2, help="Min priority to include")
+    args = parser.parse_args()
+    
+    with open(args.input) as f:
+        data = json.load(f)
+    
+    posts = data.get('posts', [])
+    print(f"Triaging {len(posts)} posts...")
+    
+    triaged = []
+    for post in posts:
+        result = triage_post(post)
+        triaged.append(result)
+    
+    # Sort by priority
+    triaged.sort(key=lambda x: x['priority'], reverse=True)
+    
+    # Filter
+    worth = [t for t in triaged if t['priority'] >= args.min_priority]
+    
+    output = {
+        'triaged_at': datetime.now(timezone.utc).isoformat(),
+        'total_posts': len(posts),
+        'worth_investigating': len(worth),
+        'posts': triaged,
+        'investigation_queue': worth,
+    }
+    
+    output_path = args.output or os.path.join(os.path.dirname(args.input), 'triage.json')
+    with open(output_path, 'w') as f:
+        json.dump(output, f, indent=2)
+    
+    # Print summary
+    print(f"\n=== Triage Summary ===")
+    print(f"Total: {len(posts)} | Worth investigating: {len(worth)}")
+    
+    if worth:
+        print(f"\n=== Investigation Queue ===")
+        for t in worth:
+            author = t['author'].get('handle', '?')
+            claims = [c['type'] for c in t['claims']]
+            links = [l['type'] for l in t['investigatable_links']]
+            print(f"\n  [{t['priority']}] {author}")
+            print(f"    {t['text'][:150]}...")
+            if claims:
+                print(f"    Claims: {', '.join(claims)}")
+            if links:
+                print(f"    Links: {', '.join(links)}")
+            print(f"    Tasks: {len(t['tasks'])}")
+    else:
+        print("\nNo posts met the investigation threshold.")
+    
+    print(f"\nSaved to {output_path}")
+
+
+if __name__ == "__main__":
+    main()