Feed Hunter: deep scraper skill, pipeline, simulator, first investigation
- Built deep-scraper skill (CDP-based X feed extraction) - Three-stage pipeline: scrape → triage → investigate - Paper trading simulator with position tracking - First live investigation: verified kch123 Polymarket profile ($9.3M P&L) - Opened first paper position: Seahawks Super Bowl @ 68c - Telegram alerts with inline action buttons - Portal build in progress (night shift)
This commit is contained in:
237
skills/deep-scraper/scripts/analyze-posts.py
Executable file
237
skills/deep-scraper/scripts/analyze-posts.py
Executable file
@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyze scraped X/Twitter posts for money-making signals.
|
||||
Reads posts.json, classifies and scores each post.
|
||||
|
||||
Usage:
|
||||
python3 analyze-posts.py <path-to-posts.json> [--output analysis.json]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
# Category keywords/patterns.
# Each category maps to the keywords that signal it and a weight that scales
# the per-match score inside classify_post().
CATEGORIES = {
    "crypto": {
        "keywords": ["bitcoin", "btc", "ethereum", "eth", "solana", "sol", "crypto",
                     "token", "defi", "dex", "nft", "airdrop", "memecoin", "altcoin",
                     "bullish", "bearish", "pump", "dump", "moon", "hodl", "whale",
                     "binance", "coinbase", "degen", "rug", "mint", "chain",
                     "staking", "yield", "liquidity", "swap", "bridge"],
        "weight": 1.0
    },
    "polymarket": {
        "keywords": ["polymarket", "prediction market", "kalshi", "manifold",
                     "betting market", "odds", "probability", "yes/no",
                     "shares", "contract"],
        "weight": 1.0
    },
    "arbitrage": {
        "keywords": ["arbitrage", "arb", "spread", "price difference",
                     "cross-exchange", "risk-free", "guaranteed profit",
                     "mismatch", "exploit"],
        "weight": 1.0
    },
    "trading": {
        "keywords": ["long", "short", "leverage", "margin", "futures",
                     "options", "calls", "puts", "entry", "exit", "target",
                     "stop loss", "take profit", "chart", "technical analysis",
                     "support", "resistance", "breakout", "reversal"],
        "weight": 0.8
    },
    "money_opportunity": {
        "keywords": ["free money", "easy money", "passive income", "side hustle",
                     "make money", "earn", "profit", "roi", "returns",
                     "alpha", "signal", "opportunity", "undervalued"],
        "weight": 0.7
    }
}

# Spam/scam signals (regexes searched against the lower-cased post text).
SPAM_SIGNALS = {
    "patterns": [
        r"dm me", r"link in bio", r"join my", r"guaranteed \d+%",
        r"100x", r"1000x", r"send .* to receive",
        r"whitelist", r"presale", r"limited spots",
        r"act now", r"don't miss", r"last chance",
        r"🚀{3,}", r"💰{3,}", r"🔥{3,}",
        r"follow.*retweet.*like", r"giveaway",
        r"drop.*wallet", r"reply.*address"
    ],
    "weight": -1.0
}

# Time sensitivity signals
TIME_SENSITIVE = [
    r"ending (soon|today|tonight|in \d+)",
    r"last \d+ (hour|minute|day)",
    r"expires? (today|tonight|soon|in)",
    r"deadline",
    r"closing (soon|in)",
    r"only \d+ (left|remaining|spots)",
    r"window closing",
    r"before .* (ends|closes|expires)"
]

# Keywords this short (tickers, abbreviations) collide with ordinary words
# when matched as substrings: "eth" in "method", "rug" in "drug", "earn" in
# "learn". They are matched as whole words instead; longer keywords keep
# substring matching so "cryptocurrency" still counts as "crypto".
_SHORT_KEYWORD_MAX = 4
_SHORT_KEYWORD_PATTERNS = {}


def _keyword_matches(keyword, text):
    """Return True when *keyword* occurs in the already lower-cased *text*."""
    if len(keyword) > _SHORT_KEYWORD_MAX:
        return keyword in text
    pattern = _SHORT_KEYWORD_PATTERNS.get(keyword)
    if pattern is None:
        # Whole word, with an optional plural suffix ("nfts", "dexes").
        pattern = re.compile(r"(?<!\w)" + re.escape(keyword) + r"(?:e?s)?(?!\w)")
        _SHORT_KEYWORD_PATTERNS[keyword] = pattern
    return pattern.search(text) is not None


def _parse_count(value):
    """Parse an engagement count such as 1234, "1,234" or "1.2K".

    Returns 0 for anything that cannot be interpreted, so one malformed
    metric no longer discards the other metrics of the same post.
    """
    if isinstance(value, bool):
        return 0
    if isinstance(value, (int, float)):
        return max(int(value), 0)
    m = re.fullmatch(r"\s*(\d[\d,]*(?:\.\d+)?)\s*([kmb])?\s*",
                     str(value or ""), re.IGNORECASE)
    if not m:
        return 0
    scale = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}.get(
        (m.group(2) or "").lower(), 1)
    return int(round(float(m.group(1).replace(",", "")) * scale))


def _is_external_link(url):
    """Return True for http(s) URLs whose host is not x.com or a subdomain of it.

    The host is parsed rather than substring-matched, because "x.com" is a
    substring of unrelated domains such as netflix.com or dropbox.com.
    """
    from urllib.parse import urlsplit

    if not isinstance(url, str) or not url.startswith("http"):
        return False
    try:
        host = (urlsplit(url).hostname or "").lower()
    except ValueError:
        return False
    return host != "x.com" and not host.endswith(".x.com")


def classify_post(post):
    """Classify a single post and return analysis.

    Args:
        post: dict for one scraped post. Keys read here: "text", "card"
            (with "title"/"description"), "metrics" ("likes", "reposts",
            "views"), "links" (dicts with "url"), "author", "url" and
            "timestamp". Missing or null values are tolerated.

    Returns:
        dict with the matched categories, spam/time-sensitivity matches,
        engagement and signal scores (rounded to 2 decimals) and a verdict:
        "likely_spam", "high_signal", "medium_signal", "low_signal" or "noise".
    """
    raw_text = post.get("text") or ""
    card = post.get("card") or {}
    text = (raw_text + " " +
            (card.get("title") or "") + " " +
            (card.get("description") or "")).lower()

    # Category detection: each matched keyword adds 0.2 * weight, capped at 1.0.
    categories = {}
    for cat_name, cat_info in CATEGORIES.items():
        matches = [kw for kw in cat_info["keywords"] if _keyword_matches(kw, text)]
        if matches:
            categories[cat_name] = {
                "matched": matches,
                "score": min(len(matches) * cat_info["weight"] * 0.2, 1.0)
            }

    # Spam detection: every matching pattern adds 0.25, capped at 1.0.
    spam_matches = [pattern for pattern in SPAM_SIGNALS["patterns"]
                    if re.search(pattern, text, re.IGNORECASE)]
    spam_score = min(len(spam_matches) * 0.25, 1.0)

    # Time sensitivity
    time_matches = []
    for pattern in TIME_SENSITIVE:
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            time_matches.append(m.group(0))
    time_sensitive = bool(time_matches)

    # Engagement quality (high engagement = more likely legit).
    # A 1% (likes + reposts) / views rate already saturates the score.
    metrics = post.get("metrics") or {}
    likes = _parse_count(metrics.get("likes"))
    reposts = _parse_count(metrics.get("reposts"))
    views = _parse_count(metrics.get("views"))
    engagement_score = 0
    if views > 0:
        engagement_rate = (likes + reposts) / views
        engagement_score = min(engagement_rate * 100, 1.0)

    # Has external links (higher value for analysis)
    external_links = [l for l in (post.get("links") or [])
                      if isinstance(l, dict) and _is_external_link(l.get("url"))]

    # Overall signal score
    category_score = max((c["score"] for c in categories.values()), default=0)
    signal_score = max(0, min(1.0,
        category_score * 0.4 +
        engagement_score * 0.2 +
        (0.1 if external_links else 0) +
        (0.1 if time_sensitive else 0) -
        spam_score * 0.3
    ))

    # Verdict
    if spam_score > 0.5:
        verdict = "likely_spam"
    elif signal_score > 0.5 and categories:
        verdict = "high_signal"
    elif signal_score > 0.25 and categories:
        verdict = "medium_signal"
    elif categories:
        verdict = "low_signal"
    else:
        verdict = "noise"

    return {
        "author": post.get("author") or {},
        "text_preview": raw_text[:200],
        "url": post.get("url", ""),
        "categories": categories,
        "spam_score": round(spam_score, 2),
        "spam_matches": spam_matches,
        "time_sensitive": time_sensitive,
        "time_matches": time_matches,
        "engagement_score": round(engagement_score, 2),
        "external_links": external_links,
        "signal_score": round(signal_score, 2),
        "verdict": verdict,
        "timestamp": post.get("timestamp"),
        "metrics": metrics
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the post analyzer.

    Loads the "posts" list from the input JSON file, scores each post with
    classify_post(), keeps those at or above --min-signal, writes the result
    as JSON (default: analysis.json next to the input) and prints a summary.
    """
    cli = argparse.ArgumentParser(description="Analyze X feed posts")
    cli.add_argument("input", help="Path to posts.json")
    cli.add_argument("--output", help="Output file (default: analysis.json in same dir)")
    cli.add_argument("--min-signal", type=float, default=0.0, help="Min signal score to include")
    args = cli.parse_args()

    with open(args.input) as src:
        data = json.load(src)

    posts = data.get("posts", [])
    print(f"Analyzing {len(posts)} posts...")

    # Score everything, keep what clears the threshold, strongest first.
    analyses = [scored for scored in map(classify_post, posts)
                if scored["signal_score"] >= args.min_signal]
    analyses.sort(key=lambda item: item["signal_score"], reverse=True)

    # Tally how many kept posts received each verdict.
    verdicts = {}
    for item in analyses:
        label = item["verdict"]
        verdicts[label] = verdicts.get(label, 0) + 1

    result = {
        "analyzed_at": datetime.now(timezone.utc).isoformat(),
        "total_posts": len(posts),
        "analyzed_posts": len(analyses),
        "verdicts": verdicts,
        "posts": analyses
    }

    # Default the output file to the input's directory.
    output_path = args.output
    if not output_path:
        import os
        output_path = os.path.join(os.path.dirname(args.input), "analysis.json")

    with open(output_path, "w") as dst:
        json.dump(result, dst, indent=2)

    # Console summary
    verdict_icons = {"high_signal": "🟢", "medium_signal": "🟡", "low_signal": "⚪",
                     "likely_spam": "🔴", "noise": "⚫"}
    print("\n=== Analysis Summary ===")
    print(f"Total posts: {len(posts)}")
    for verdict, count in sorted(verdicts.items()):
        emoji = verdict_icons.get(verdict, "❓")
        print(f"  {emoji} {verdict}: {count}")

    # Up to ten of the best-scoring high/medium posts.
    top = [item for item in analyses
           if item["verdict"] in ("high_signal", "medium_signal")]
    if top:
        print("\n=== Top Signals ===")
        for item in top[:10]:
            cats = ", ".join(item["categories"].keys())
            ts = "⏰" if item["time_sensitive"] else ""
            print(f"  [{item['signal_score']:.2f}] {item['author'].get('handle', '?')} — {cats} {ts}")
            print(f"    {item['text_preview'][:100]}...")

    print(f"\nSaved to {output_path}")


if __name__ == "__main__":
    main()
|
||||
42
skills/deep-scraper/scripts/launch-chrome-debug.sh
Executable file
42
skills/deep-scraper/scripts/launch-chrome-debug.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/bash
# Launch Chrome with remote debugging for deep scraping.
# Uses a copy of the user's profile to enable the debug port.
#
# Usage: launch-chrome-debug.sh [PORT]        (PORT defaults to 9222)
# Optional environment overrides: CHROME_PROFILE_SRC, CHROME_PROFILE_DBG.
# Exits 0 once the DevTools endpoint answers, 1 if it is not up after 15s
# or the initial profile copy fails.

PROFILE_SRC="${CHROME_PROFILE_SRC:-/home/wdjones/.config/google-chrome}"
PROFILE_DBG="${CHROME_PROFILE_DBG:-/home/wdjones/.config/google-chrome-debug}"
PORT="${1:-9222}"

# Kill any existing debug Chrome. Match on the debug profile argument instead
# of the bare string "chrome-debug": `pkill -f` matches full command lines,
# and this script's own name (launch-chrome-debug.sh) contains that string.
if pkill -f -- "--user-data-dir=$PROFILE_DBG" 2>/dev/null; then
    # Give the old instance a moment to exit before touching its profile.
    sleep 1
fi

# Create/sync debug profile
if [ ! -d "$PROFILE_DBG" ]; then
    echo "Creating debug profile (first time, this takes a moment)..."
    if ! cp -r "$PROFILE_SRC" "$PROFILE_DBG"; then
        echo "ERROR: could not copy profile from $PROFILE_SRC" >&2
        exit 1
    fi
else
    # Sync cookies and local storage
    cp "$PROFILE_SRC/Default/Cookies" "$PROFILE_DBG/Default/Cookies" 2>/dev/null
    # Copy the directory *contents*; `cp -r src dst` with an existing dst
    # would create a nested "Local Storage/Local Storage" instead.
    mkdir -p "$PROFILE_DBG/Default/Local Storage"
    cp -r "$PROFILE_SRC/Default/Local Storage/." "$PROFILE_DBG/Default/Local Storage/" 2>/dev/null
fi

# Remove stale single-instance locks carried over from the source profile or
# left behind by a killed debug instance.
rm -f "$PROFILE_DBG/SingletonLock" "$PROFILE_DBG/SingletonSocket" "$PROFILE_DBG/SingletonCookie" 2>/dev/null

# NOTE(review): --no-sandbox and --remote-allow-origins=* weaken Chrome's
# security on a logged-in profile; confirm both are really required here.
DISPLAY=:0 /usr/bin/google-chrome-stable --no-sandbox \
    --user-data-dir="$PROFILE_DBG" \
    --remote-debugging-port="$PORT" \
    "--remote-allow-origins=*" \
    https://x.com/home &>/dev/null &

echo "Chrome launched (PID $!, debug port $PORT)"
echo "Waiting for port..."

for i in $(seq 1 15); do
    if curl -s "http://127.0.0.1:$PORT/json" >/dev/null 2>&1; then
        echo "Ready!"
        exit 0
    fi
    sleep 1
done

echo "ERROR: Port $PORT not ready after 15s"
exit 1
|
||||
344
skills/deep-scraper/scripts/scrape-x-feed.py
Executable file
344
skills/deep-scraper/scripts/scrape-x-feed.py
Executable file
@ -0,0 +1,344 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Deep scraper for X/Twitter feed — extracts structured post data via CDP.
|
||||
Connects to an existing Chrome instance with remote debugging enabled.
|
||||
|
||||
Usage:
|
||||
python3 scrape-x-feed.py [--port 9222] [--scroll-pages 5] [--output DIR]
|
||||
|
||||
Requires: Chrome running with --remote-debugging-port=9222
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
|
||||
import websocket
|
||||
|
||||
|
||||
def cdp_send(ws, method, params=None, msg_id=[0]):
    """Issue one CDP command on an open WebSocket and wait for its reply.

    Args:
        ws: connected WebSocket exposing settimeout(), send() and recv().
        method: CDP method name, e.g. "Runtime.evaluate".
        params: optional dict of method parameters.
        msg_id: one-element list used as a counter. The mutable default is
            shared between calls on purpose, so ids keep increasing.

    Returns:
        The decoded response whose "id" matches the request, or
        {"error": "timeout"} if receiving/decoding a frame fails, or
        {"error": "too many events"} after 100 non-matching frames.
    """
    msg_id[0] += 1
    request_id = msg_id[0]

    ws.settimeout(15)
    ws.send(json.dumps({"id": request_id, "method": method, "params": params or {}}))

    # Frames without our id are CDP events; skip at most 100 of them.
    frames_left = 100
    while frames_left > 0:
        frames_left -= 1
        try:
            reply = json.loads(ws.recv())
        except Exception:
            return {"error": "timeout"}
        if reply.get("id") == request_id:
            return reply
    return {"error": "too many events"}
|
||||
|
||||
|
||||
def get_ws_url(port):
    """Get WebSocket debugger URL from Chrome DevTools.

    Queries http://127.0.0.1:<port>/json and picks, in order of preference:
    a page target whose host is x.com (or a subdomain), the first page
    target, then the first target of any type. Targets that expose no
    "webSocketDebuggerUrl" are ignored.

    Args:
        port: Chrome remote-debugging port.

    Returns:
        The chosen target's webSocketDebuggerUrl string.

    Raises:
        RuntimeError: if no attachable target is listed.
        urllib.error.URLError / OSError: if the endpoint is unreachable.
    """
    from urllib.parse import urlsplit

    def is_x_page(tab):
        # Compare the parsed host; a substring test would also accept
        # hosts such as netflix.com that merely contain "x.com".
        try:
            host = (urlsplit(tab.get("url") or "").hostname or "").lower()
        except ValueError:
            return False
        return host == "x.com" or host.endswith(".x.com")

    url = f"http://127.0.0.1:{port}/json"
    # The endpoint is always on loopback, so bypass any configured proxy,
    # and close the response as soon as it has been read.
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
    with opener.open(url, timeout=5) as resp:
        tabs = json.loads(resp.read())

    attachable = [t for t in tabs
                  if isinstance(t, dict) and t.get("webSocketDebuggerUrl")]
    pages = [t for t in attachable if t.get("type", "page") == "page"]

    for tab in pages:
        if is_x_page(tab):
            return tab["webSocketDebuggerUrl"]

    # Fallback to the first page, or failing that the first target of any kind.
    for tab in pages or attachable:
        return tab["webSocketDebuggerUrl"]

    raise RuntimeError("No Chrome tabs found")
|
||||
|
||||
|
||||
# JavaScript to extract tweets from the DOM.
# Evaluated in the page via Runtime.evaluate; the expression returns a JSON
# string encoding an array of post objects with the keys author
# ({displayName, handle}), text, timestamp, timeText, url, metrics, links,
# media, card and repostBy.
EXTRACT_JS = r"""
(() => {
    const posts = [];
    const seen = new Set();

    // X/Twitter uses article elements for tweets
    const articles = document.querySelectorAll('article[data-testid="tweet"]');

    for (const article of articles) {
        try {
            // Author info: first profile-style link gives the handle
            const userLinks = article.querySelectorAll('a[role="link"]');
            let displayName = '';
            let handle = '';
            for (const link of userLinks) {
                const href = link.getAttribute('href') || '';
                if (href.match(/^\/[a-zA-Z0-9_]+$/) && !href.includes('/status/')) {
                    if (!handle) handle = href.replace('/', '@');
                    const nameEl = link.querySelector('span');
                    if (nameEl && !displayName) displayName = nameEl.textContent.trim();
                }
            }

            // Tweet text
            const textEl = article.querySelector('[data-testid="tweetText"]');
            const text = textEl ? textEl.textContent.trim() : '';

            // Link to tweet
            let tweetUrl = '';
            const statusLinks = article.querySelectorAll('a[href*="/status/"]');
            for (const sl of statusLinks) {
                const href = sl.getAttribute('href') || '';
                if (href.match(/\/status\/\d+$/)) {
                    tweetUrl = 'https://x.com' + href;
                    break;
                }
            }

            // Skip duplicates. Prefer the status URL as the identity: the
            // author + text prefix alone would merge distinct posts that
            // have no text (e.g. media-only) from the same author.
            const key = tweetUrl || `${handle}:${text.slice(0, 50)}`;
            if (seen.has(key)) continue;
            seen.add(key);

            // Timestamp
            const timeEl = article.querySelector('time');
            const timestamp = timeEl ? timeEl.getAttribute('datetime') : null;
            const timeText = timeEl ? timeEl.textContent.trim() : '';

            // Engagement metrics, read from button aria-labels.
            // Labels are compared case-insensitively so every metric is
            // treated the same way regardless of capitalisation.
            const metrics = {};
            const groups = article.querySelectorAll('[role="group"]');
            for (const group of groups) {
                const buttons = group.querySelectorAll('button');
                for (const btn of buttons) {
                    const label = btn.getAttribute('aria-label') || '';
                    const lower = label.toLowerCase();
                    const m = label.match(/(\d[\d,.]*)/);
                    if (!m) continue;
                    if (lower.includes('repl')) {
                        metrics.replies = m[1];
                    } else if (lower.includes('repost')) {
                        metrics.reposts = m[1];
                    } else if (lower.includes('like')) {
                        metrics.likes = m[1];
                    } else if (lower.includes('view')) {
                        metrics.views = m[1];
                    } else if (lower.includes('bookmark')) {
                        metrics.bookmarks = m[1];
                    }
                }
            }

            // Embedded links (site-relative hrefs are made absolute)
            const links = [];
            if (textEl) {
                const anchors = textEl.querySelectorAll('a');
                for (const a of anchors) {
                    const href = a.getAttribute('href') || '';
                    const linkText = a.textContent.trim();
                    if (href && !href.startsWith('/')) {
                        links.push({ url: href, text: linkText });
                    } else if (href.startsWith('/')) {
                        links.push({ url: 'https://x.com' + href, text: linkText });
                    }
                }
            }

            // Media (images/video indicators)
            const media = [];
            const imgs = article.querySelectorAll('[data-testid="tweetPhoto"] img');
            for (const img of imgs) {
                media.push({ type: 'image', src: img.src });
            }
            const videoEl = article.querySelector('[data-testid="videoPlayer"]');
            if (videoEl) media.push({ type: 'video' });

            // Card/preview
            const card = article.querySelector('[data-testid="card.wrapper"]');
            let cardData = null;
            if (card) {
                const cardTitle = card.querySelector('[data-testid="card.layoutLarge.title"], [data-testid="card.layoutSmall.title"]');
                const cardDesc = card.querySelector('[data-testid="card.layoutLarge.description"], [data-testid="card.layoutSmall.description"]');
                const cardLink = card.querySelector('a');
                cardData = {
                    title: cardTitle ? cardTitle.textContent.trim() : null,
                    description: cardDesc ? cardDesc.textContent.trim() : null,
                    url: cardLink ? cardLink.getAttribute('href') : null
                };
            }

            // Is it a repost?
            const socialContext = article.querySelector('[data-testid="socialContext"]');
            let repostBy = null;
            if (socialContext && socialContext.textContent.includes('reposted')) {
                repostBy = socialContext.textContent.replace(' reposted', '').trim();
            }

            posts.push({
                author: { displayName, handle },
                text,
                timestamp,
                timeText,
                url: tweetUrl,
                metrics,
                links,
                media,
                card: cardData,
                repostBy
            });
        } catch (e) {
            // Skip malformed tweets
        }
    }

    return JSON.stringify(posts);
})()
"""
|
||||
|
||||
|
||||
def _post_key(post):
    """Return the dedup identity of a scraped post.

    The status URL is used when present; otherwise the author handle plus
    the first 80 characters of text. Keying on handle + text alone would
    merge every text-less post from the same author.
    """
    url = post.get("url")
    if url:
        return url
    author = post.get("author") or {}
    return f"{author.get('handle', '')}:{(post.get('text') or '')[:80]}"


def _write_summary(summary_file, timestamp, all_posts):
    """Write the human-readable Markdown summary of *all_posts*."""
    # UTF-8 is forced because the summary contains emoji and arbitrary post
    # text, which a non-UTF-8 locale encoding cannot represent.
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(f"# X Feed Scrape — {timestamp}\n\n")
        f.write(f"**Total posts:** {len(all_posts)}\n\n")
        for i, post in enumerate(all_posts, 1):
            author = post.get('author') or {}
            f.write(f"## {i}. {author.get('displayName', '')} ({author.get('handle', '')})\n")
            if post.get('repostBy'):
                f.write(f"*Reposted by {post['repostBy']}*\n")
            f.write(f"\n{post.get('text') or ''}\n\n")
            if post.get('metrics'):
                parts = [f"{k}: {v}" for k, v in post['metrics'].items()]
                f.write(f"📊 {' | '.join(parts)}\n")
            if post.get('links'):
                f.write(f"\n🔗 Links:\n")
                for link in post['links']:
                    f.write(f"  - [{link.get('text', '')}]({link.get('url', '')})\n")
            if post.get('card'):
                c = post['card']
                # The extractor stores a missing title as null, so fall back
                # on falsy values rather than only on a missing key.
                f.write(f"\n📎 Card: {c.get('title') or 'N/A'}\n")
                if c.get('description'):
                    f.write(f"   {c['description']}\n")
            if post.get('url'):
                f.write(f"\n🔗 {post['url']}\n")
            f.write(f"\n---\n\n")


def scrape_via_cdp(port, scroll_pages, output_dir):
    """Scrape X feed using Chrome DevTools Protocol.

    Connects to the tab chosen by get_ws_url(), evaluates EXTRACT_JS once per
    "page", scrolling two viewport heights and waiting 3s between pages, and
    de-duplicates posts across pages.

    Args:
        port: Chrome remote-debugging port.
        scroll_pages: number of extract/scroll iterations.
        output_dir: directory in which a timestamped run directory is created.

    Returns:
        Path of the run directory containing posts.json and summary.md.
    """
    ws_url = get_ws_url(port)
    print(f"Connecting to: {ws_url}")
    ws = websocket.create_connection(ws_url, timeout=30)
    print("Connected!")

    all_posts = []
    seen_keys = set()

    try:
        for page in range(scroll_pages):
            print(f"Scraping page {page + 1}/{scroll_pages}...")

            # Execute extraction JS
            result = cdp_send(ws, "Runtime.evaluate", {
                "expression": EXTRACT_JS,
                "returnByValue": True
            })

            # Surface transport/evaluation failures instead of silently
            # reporting an empty page.
            if "error" in result:
                print(f"  WARNING: CDP evaluate failed: {result['error']}", file=sys.stderr)
            evaluation = result.get("result") or {}
            if evaluation.get("exceptionDetails"):
                print("  WARNING: extraction script raised an exception", file=sys.stderr)

            value = (evaluation.get("result") or {}).get("value", "[]")
            try:
                posts = json.loads(value) if isinstance(value, str) else []
            except ValueError:
                posts = []
            if not isinstance(posts, list):
                posts = []

            # Deduplicate
            new = 0
            for post in posts:
                key = _post_key(post)
                if key not in seen_keys:
                    seen_keys.add(key)
                    all_posts.append(post)
                    new += 1

            print(f"  Found {len(posts)} posts ({new} new)")

            if page < scroll_pages - 1:
                # Scroll down and give the feed time to load more posts
                cdp_send(ws, "Runtime.evaluate", {
                    "expression": "window.scrollBy(0, window.innerHeight * 2)"
                })
                time.sleep(3)
    finally:
        # Always release the debugger connection, even if scraping fails.
        ws.close()

    # Save output
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    run_dir = os.path.join(output_dir, timestamp)
    os.makedirs(run_dir, exist_ok=True)

    output_file = os.path.join(run_dir, "posts.json")
    with open(output_file, "w") as f:
        json.dump({
            "timestamp": timestamp,
            "total_posts": len(all_posts),
            "posts": all_posts
        }, f, indent=2)

    # Also save a human-readable summary
    _write_summary(os.path.join(run_dir, "summary.md"), timestamp, all_posts)

    print(f"\nDone! {len(all_posts)} posts saved to {run_dir}/")
    print(f"  posts.json  — structured data")
    print(f"  summary.md  — human-readable")
    return run_dir
|
||||
|
||||
|
||||
def scrape_via_xdotool(scroll_pages, output_dir):
    """Scrape via CDP on port 9222, launching Chrome first if needed.

    Despite its name this function does not use xdotool: it probes the
    DevTools endpoint on port 9222, starts Chrome with remote debugging when
    nothing answers, waits up to ~20 probes for the port, then delegates to
    scrape_via_cdp().

    Args:
        scroll_pages: number of extract/scroll iterations.
        output_dir: directory in which the run directory is created.

    Returns:
        The run directory returned by scrape_via_cdp(). Exits the process
        with status 1 if Chrome cannot be started or never opens the port.
    """
    debug_url = "http://127.0.0.1:9222/json"

    def debug_port_ready():
        # Best-effort probe: any failure just means "not ready".
        try:
            with urllib.request.urlopen(debug_url, timeout=2):
                return True
        except Exception:
            return False

    # Only the probe is guarded. The scrape itself runs outside any handler,
    # so a scraping failure propagates instead of being swallowed and
    # followed by a second Chrome launch.
    if debug_port_ready():
        return scrape_via_cdp(9222, scroll_pages, output_dir)

    # Launch Chrome with remote debugging
    # NOTE(review): this uses the main profile directory, whereas
    # launch-chrome-debug.sh uses a google-chrome-debug copy "to enable debug
    # port" — confirm Chrome honours the debug port with this profile.
    print("Launching Chrome with remote debugging...")
    try:
        subprocess.Popen([
            "/usr/bin/google-chrome-stable",
            "--no-sandbox",
            "--user-data-dir=/home/wdjones/.config/google-chrome",
            "--remote-debugging-port=9222",
            "https://x.com/home"
        ], env={**os.environ, "DISPLAY": ":0"},
           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as exc:
        print(f"ERROR: could not launch Chrome: {exc}", file=sys.stderr)
        sys.exit(1)

    # Wait for Chrome to be ready
    for _ in range(20):
        if debug_port_ready():
            break
        time.sleep(1)
    else:
        print("ERROR: Chrome didn't start with debugging port", file=sys.stderr)
        sys.exit(1)

    time.sleep(5)  # Let page load
    return scrape_via_cdp(9222, scroll_pages, output_dir)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the feed scraper.

    Ensures the output directory exists, then either scrapes an already
    running Chrome on --port, or (with --launch) goes through
    scrape_via_xdotool(), which starts Chrome when necessary.
    """
    cli = argparse.ArgumentParser(description="Deep scrape X/Twitter feed")
    cli.add_argument("--port", type=int, default=9222, help="Chrome debugging port")
    cli.add_argument("--scroll-pages", type=int, default=5, help="Number of scroll pages")
    cli.add_argument("--output", default="/home/wdjones/.openclaw/workspace/data/x-feed",
                     help="Output directory")
    cli.add_argument("--launch", action="store_true", help="Launch Chrome if not running")
    opts = cli.parse_args()

    os.makedirs(opts.output, exist_ok=True)

    if not opts.launch:
        scrape_via_cdp(opts.port, opts.scroll_pages, opts.output)
        return
    # The launch path takes no port argument; it works on 9222 internally.
    scrape_via_xdotool(opts.scroll_pages, opts.output)


if __name__ == "__main__":
    main()
|
||||
281
skills/deep-scraper/scripts/triage-posts.py
Normal file
281
skills/deep-scraper/scripts/triage-posts.py
Normal file
@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Triage scraped X posts — identify posts with verifiable claims and links.
|
||||
Extracts structured "investigation tasks" for agent follow-up.
|
||||
|
||||
Usage:
|
||||
python3 triage-posts.py <posts.json> [--output triage.json]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
# Patterns that suggest a verifiable claim, as (regex, claim_type) pairs.
# Patterns that begin with a word are anchored with \b so short alternatives
# cannot fire inside unrelated words ("arb" in "garbage", "pt" in "script",
# "no shares" in "casino shares").
CLAIM_PATTERNS = [
    # Performance claims
    (r'(\d+[\d.]*)\s*%\s*(win|success|profit|return|accuracy|hit rate)', 'performance_claim'),
    (r'\bwins?\s+(\d+[\d.]*)\s*%', 'performance_claim'),
    (r'(\d+[\d.]*)\s*%\s+of the time', 'performance_claim'),
    (r'(\d+[\d.]*)x\s+(return|profit|gain)', 'multiplier_claim'),

    # Copy/follow trading
    (r'\bcopy(ing|cat)?\s+(trader|user|bet|position|strat)', 'copy_trading'),
    (r'\bfollow\s+(this|my|their)\s+(trade|bet|position|strat)', 'copy_trading'),
    (r'\bmirror(ing)?\s+(trade|bet|position)', 'copy_trading'),

    # Arbitrage/spread (longest alternative first so "arbitrage" is reported
    # in full rather than as its "arb" prefix)
    (r'\b(arbitrage|arbs?|spreads?|mismatch(?:ed|es)?|mispriced)\b', 'arbitrage_opp'),
    (r'\brisk[\s-]?free', 'arbitrage_opp'),
    (r'\bguaranteed\s+(profit|return|money)', 'arbitrage_opp'),

    # Prediction/betting
    (r'(polymarket|kalshi|manifold|prediction\s+market)', 'prediction_market'),
    (r'\b(odds|probability)\s+.{0,20}\s*(\d+[\d.]*)\s*%', 'odds_claim'),
    (r'\b(yes|no)\s+shares?\s+at\s+(\d+)', 'shares_price'),

    # Price/target claims
    (r'\b(price target|target|pt)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'price_target'),
    (r'\b(entry|buy)\s*(at|zone|point)\s*[:\s]+\$?\s*(\d[\d,.]*)', 'entry_point'),

    # Airdrop/free money
    (r'\b(airdrop|free\s+money|free\s+tokens?|claiming)', 'airdrop'),
    (r'\b(step\s+\d|how\s+to\s+(get|claim|earn))', 'howto'),

    # User/account references
    (r'@\w+.*?(portfolio|track\s*record|history|performance)', 'user_reference'),
    (r'\b(this\s+(guy|trader|user|person|account))\s+.{0,30}(profit|win|return|made)', 'user_reference'),
]

# Link domains that are investigatable, mapped to the link type they imply.
INVESTIGATABLE_DOMAINS = {
    'polymarket.com': 'prediction_market',
    'kalshi.com': 'prediction_market',
    'manifold.markets': 'prediction_market',
    'dexscreener.com': 'token_chart',
    'dextools.io': 'token_chart',
    'birdeye.so': 'token_chart',
    'coingecko.com': 'token_info',
    'coinmarketcap.com': 'token_info',
    'tradingview.com': 'chart',
    'etherscan.io': 'blockchain',
    'solscan.io': 'blockchain',
    'basescan.org': 'blockchain',
    'github.com': 'code_repo',
    'docs.google.com': 'document',
    'notion.so': 'document',
    'medium.com': 'article',
    'substack.com': 'article',
    'youtube.com': 'video',
    'youtu.be': 'video',
}


def extract_claims(text):
    """Extract verifiable claims from post text.

    Args:
        text: raw post text.

    Returns:
        A list of dicts {'type', 'match', 'span'}, one per regex match,
        grouped in CLAIM_PATTERNS order and, within a pattern, in order of
        appearance. 'span' is [start, end] into *text*. Matching is
        case-insensitive.
    """
    claims = []
    for pattern, claim_type in CLAIM_PATTERNS:
        for m in re.finditer(pattern, text or '', re.IGNORECASE):
            claims.append({
                'type': claim_type,
                'match': m.group(0),
                'span': [m.start(), m.end()],
            })
    return claims


def _link_host(url):
    """Return the lower-cased host of *url*, or '' if it has none."""
    from urllib.parse import urlsplit

    if not isinstance(url, str) or not url:
        return ''
    # Scheme-less values ("polymarket.com/event/x") are parsed as
    # network-path references so their host is still recognised.
    candidate = url if re.match(r'[a-z][a-z0-9+.-]*://', url, re.IGNORECASE) else '//' + url
    try:
        return (urlsplit(candidate).hostname or '').lower()
    except ValueError:
        return ''


def classify_links(links):
    """Classify links by investigatable domain.

    The URL's host must equal a known domain or be a subdomain of it. A
    plain substring test would label "emotion.social" as notion.so and
    would match domains that only appear in a path or query string.

    Args:
        links: iterable of dicts with 'url' and optional 'text'.

    Returns:
        A list of {'url', 'text', 'type'} dicts in input order; 'type' is
        the INVESTIGATABLE_DOMAINS value or 'unknown'.
    """
    classified = []
    for link in links or []:
        url = link.get('url') or ''
        host = _link_host(url)
        link_type = 'unknown'
        for domain, dtype in INVESTIGATABLE_DOMAINS.items():
            if host == domain or host.endswith('.' + domain):
                link_type = dtype
                break
        classified.append({
            'url': url,
            'text': link.get('text', ''),
            'type': link_type,
        })
    return classified
|
||||
|
||||
|
||||
def triage_post(post):
    """Analyze a single post for investigation potential.

    Args:
        post: dict for one scraped post. Keys read: 'text', 'links', 'card'
            ('url', 'title'), 'author', 'url', 'timestamp', 'metrics'.
            The post is not modified.

    Returns:
        dict with the extracted claims, classified links, a numeric
        'priority', the list of follow-up 'tasks' and 'worth_investigating'
        (True when priority >= 2).
    """
    from urllib.parse import urlsplit

    def is_on_x(url):
        # Host comparison; "x.com" as a substring would also exclude
        # unrelated hosts such as netflix.com.
        try:
            host = (urlsplit(url).hostname or '').lower()
        except ValueError:
            return False
        return host == 'x.com' or host.endswith('.x.com')

    text = post.get('text', '') or ''

    # Extract claims
    claims = extract_claims(text)

    # Classify links. Work on a copy: appending the card link to the
    # original list would mutate the caller's post['links'].
    all_links = list(post.get('links') or [])
    card = post.get('card') or {}
    if card.get('url'):
        all_links.append({'url': card['url'], 'text': card.get('title') or ''})
    classified_links = classify_links(all_links)
    investigatable_links = [l for l in classified_links if l['type'] != 'unknown']

    # Has the post got something worth investigating?
    has_claims = len(claims) > 0
    has_links = len(investigatable_links) > 0
    has_any_links = len(classified_links) > 0

    # Priority scoring
    priority = 0
    if has_claims:
        priority += 2
        # Performance claims are highest priority
        if any(c['type'] in ('performance_claim', 'multiplier_claim') for c in claims):
            priority += 2
        if any(c['type'] in ('copy_trading', 'arbitrage_opp') for c in claims):
            priority += 1
    if has_links:
        priority += 2
        if any(l['type'] == 'prediction_market' for l in investigatable_links):
            priority += 2
        if any(l['type'] in ('token_chart', 'blockchain') for l in investigatable_links):
            priority += 1
    if has_any_links and not has_links:
        priority += 1

    # Claim type -> (action, description prefix, method). Claim types not
    # listed here produce no task. 'multiplier_claim' shares the performance
    # task because the scoring above ranks it alongside performance claims.
    performance_task = ('verify_performance', 'Verify claim',
                        'Check linked profile/data for actual track record')
    market_task = ('check_market', 'Check prediction market',
                   'Verify current odds, volume, resolution criteria')
    claim_tasks = {
        'performance_claim': performance_task,
        'multiplier_claim': performance_task,
        'copy_trading': ('verify_trader', 'Verify trader referenced',
                         'Check trader profile, recent bets/trades, actual P&L'),
        'arbitrage_opp': ('verify_arb', 'Verify opportunity',
                          'Check if spread/mismatch still exists, calculate actual risk'),
        'odds_claim': market_task,
        'shares_price': market_task,
        'prediction_market': market_task,
        'price_target': ('check_price', 'Verify price claim',
                         'Check current price vs target, chart pattern'),
        'airdrop': ('verify_airdrop', 'Check airdrop legitimacy',
                    'Verify project, check for scam signals, confirm eligibility'),
    }

    # Build investigation tasks: claims first, then known links, then
    # unknown external links.
    tasks = []

    for claim in claims:
        spec = claim_tasks.get(claim['type'])
        if spec is None:
            continue
        action, label, method = spec
        tasks.append({
            'action': action,
            'description': f"{label}: {claim['match']}",
            'method': method,
        })

    for link in investigatable_links:
        tasks.append({
            'action': f'browse_{link["type"]}',
            'description': f'Follow and analyze: {link["url"]}',
            'url': link['url'],
            'method': f'Browse to {link["type"]} link, extract current data',
        })

    # For unknown links that might be interesting
    for link in classified_links:
        if link['type'] == 'unknown' and link['url'].startswith('http') and not is_on_x(link['url']):
            tasks.append({
                'action': 'browse_unknown',
                'description': f'Check external link: {link["url"]}',
                'url': link['url'],
                'method': 'Browse to link, determine if relevant',
            })

    return {
        'author': post.get('author', {}),
        'text': text,
        'url': post.get('url', ''),
        'timestamp': post.get('timestamp'),
        'metrics': post.get('metrics', {}),
        'claims': claims,
        'links': classified_links,
        'investigatable_links': investigatable_links,
        'priority': priority,
        'tasks': tasks,
        'worth_investigating': priority >= 2,
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for post triage.

    Loads the 'posts' list from the input JSON file, triages every post,
    orders them by descending priority, writes the full result plus the
    queue of posts at or above --min-priority as JSON (default: triage.json
    next to the input) and prints a summary.
    """
    cli = argparse.ArgumentParser(description="Triage X posts for investigation")
    cli.add_argument("input", help="Path to posts.json")
    cli.add_argument("--output", help="Output file")
    cli.add_argument("--min-priority", type=int, default=2, help="Min priority to include")
    args = cli.parse_args()

    with open(args.input) as src:
        data = json.load(src)

    posts = data.get('posts', [])
    print(f"Triaging {len(posts)} posts...")

    # Highest priority first; the sort is stable, so ties keep feed order.
    triaged = sorted((triage_post(post) for post in posts),
                     key=lambda item: item['priority'], reverse=True)

    # Posts that clear the threshold form the investigation queue.
    worth = [item for item in triaged if item['priority'] >= args.min_priority]

    output = {
        'triaged_at': datetime.now(timezone.utc).isoformat(),
        'total_posts': len(posts),
        'worth_investigating': len(worth),
        'posts': triaged,
        'investigation_queue': worth,
    }

    output_path = args.output
    if not output_path:
        output_path = os.path.join(os.path.dirname(args.input), 'triage.json')
    with open(output_path, 'w') as dst:
        json.dump(output, dst, indent=2)

    # Print summary
    print("\n=== Triage Summary ===")
    print(f"Total: {len(posts)} | Worth investigating: {len(worth)}")

    if not worth:
        print("\nNo posts met the investigation threshold.")
    else:
        print("\n=== Investigation Queue ===")
        for item in worth:
            author = item['author'].get('handle', '?')
            claim_types = [c['type'] for c in item['claims']]
            link_types = [l['type'] for l in item['investigatable_links']]
            print(f"\n  [{item['priority']}] {author}")
            print(f"    {item['text'][:150]}...")
            if claim_types:
                print(f"    Claims: {', '.join(claim_types)}")
            if link_types:
                print(f"    Links: {', '.join(link_types)}")
            print(f"    Tasks: {len(item['tasks'])}")

    print(f"\nSaved to {output_path}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user