Playwright X scraper, daily notes, feed analysis
This commit is contained in:
249
projects/feed-hunter/x_scraper_pw.py
Normal file
249
projects/feed-hunter/x_scraper_pw.py
Normal file
@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
X/Twitter Feed Scraper using Playwright
|
||||
Scrapes specific accounts for trading-related posts.
|
||||
Uses saved Chrome session cookies for authentication.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# X handles to scrape (no leading '@').
ACCOUNTS = [
    "browomo", "ArchiveExplorer", "noisyb0y1", "krajekis",
    "Shelpid_WI3M", "polyaboretum", "0xashensoul",
]

# Lowercase substrings matched case-insensitively against post text in
# is_trading_related(); "$" catches cashtags and price mentions.
TRADING_KEYWORDS = [
    "polymarket", "trade", "profit", "wallet", "arbitrage", "signal",
    "crypto", "bitcoin", "ethereum", "solana", "strategy", "edge",
    "bet", "position", "stock", "market", "pnl", "alpha",
    "$", "usdc", "defi", "token", "copy", "whale", "degen",
    "short", "long", "bullish", "bearish", "portfolio",
]

# Scan results and the seen-posts ledger live under <repo>/data/x-feed.
DATA_DIR = Path(__file__).parent.parent / "data" / "x-feed"
DATA_DIR.mkdir(parents=True, exist_ok=True)  # import-time side effect: ensure the dir exists

# Playwright-format cookie jar, exported once from a live Chrome debug session.
COOKIE_FILE = Path(__file__).parent / "x_cookies.json"
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
# NOTE(review): hard-coded fallback chat id — consider requiring the env var instead.
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "6443752046")
|
||||
|
||||
|
||||
def send_telegram(message):
    """Send *message* to the configured Telegram chat (best-effort).

    Falls back to printing the alert locally when TELEGRAM_BOT_TOKEN is
    unset, so the scraper still surfaces findings without credentials.
    API errors are logged and swallowed — an alert failure must not
    abort a scrape run.
    """
    if not TELEGRAM_BOT_TOKEN:
        print(f"[ALERT] {message}")
        return
    import urllib.request
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = json.dumps({"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "HTML"}).encode()
    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
    try:
        # Close the response explicitly; a bare urlopen() leaks the socket.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        print(f"Telegram error: {e}")
|
||||
|
||||
|
||||
def save_cookies(context):
    """Serialize the browser context's cookies to COOKIE_FILE as JSON."""
    cookie_list = context.cookies()
    serialized = json.dumps(cookie_list, indent=2)
    COOKIE_FILE.write_text(serialized)
    print(f"Saved {len(cookie_list)} cookies")
|
||||
|
||||
|
||||
def load_cookies(context):
    """Load saved cookies from COOKIE_FILE into *context*.

    Returns True when a cookie file existed and was applied, False otherwise.
    """
    if not COOKIE_FILE.exists():
        return False
    stored = json.loads(COOKIE_FILE.read_text())
    context.add_cookies(stored)
    print(f"Loaded {len(stored)} cookies")
    return True
|
||||
|
||||
|
||||
def export_cookies_from_chrome():
    """One-time: grab cookies from the running Chrome debug instance.

    Connects to the Chrome DevTools HTTP endpoint on localhost:9222,
    locates an open x.com tab, pulls all cookies over the tab's DevTools
    WebSocket (Network.getAllCookies), and writes the x.com/twitter.com
    subset to COOKIE_FILE in Playwright's cookie format.

    Returns the list of exported cookies ([] when no X tab is open).
    """
    import http.client
    import websocket as ws_mod  # third-party 'websocket-client', used only here

    conn = http.client.HTTPConnection("localhost", 9222)
    try:
        conn.request("GET", "/json")
        tabs = json.loads(conn.getresponse().read())
    finally:
        conn.close()  # original leaked the DevTools HTTP connection

    x_tab = None
    for t in tabs:
        if "x.com" in t.get("url", ""):
            x_tab = t
            break

    if not x_tab:
        print("No X tab found in Chrome debug")
        return []

    ws = ws_mod.create_connection(x_tab["webSocketDebuggerUrl"], timeout=10)
    try:
        ws.send(json.dumps({"id": 1, "method": "Network.getAllCookies"}))
        result = json.loads(ws.recv())
    finally:
        ws.close()  # close even when send/recv raises
    all_cookies = result.get("result", {}).get("cookies", [])

    # Filter for x.com cookies and convert to Playwright format
    x_cookies = []
    for c in all_cookies:
        if "x.com" in c.get("domain", "") or "twitter.com" in c.get("domain", ""):
            x_cookies.append({
                "name": c["name"],
                "value": c["value"],
                "domain": c["domain"],
                "path": c.get("path", "/"),
                "secure": c.get("secure", False),
                "httpOnly": c.get("httpOnly", False),
                "sameSite": c.get("sameSite", "Lax"),
            })

    COOKIE_FILE.write_text(json.dumps(x_cookies, indent=2))
    print(f"Exported {len(x_cookies)} X cookies from Chrome")
    return x_cookies
|
||||
|
||||
|
||||
def scrape_account(page, account, max_scroll=5):
    """Scrape recent posts from a single account's X profile page.

    Args:
        page: Playwright page (assumed already authenticated via cookies).
        account: handle without the leading '@'.
        max_scroll: number of scroll passes used to lazy-load more posts.

    Returns:
        List of post dicts with keys: account, text, urls, scraped_at.
    """
    posts = []

    # Prefer waiting for network idle; X's long-polling often keeps the
    # network busy forever, so fall back to DOM-loaded plus a fixed wait.
    try:
        page.goto(f"https://x.com/{account}", wait_until="networkidle", timeout=15000)
    except Exception:  # was bare `except:`, which also swallowed KeyboardInterrupt
        try:
            page.goto(f"https://x.com/{account}", wait_until="domcontentloaded", timeout=10000)
            page.wait_for_timeout(3000)
        except Exception as e:
            print(f" Failed to load @{account}: {e}")
            return posts

    seen_texts = set()

    for scroll in range(max_scroll):
        articles = page.query_selector_all("article")

        for article in articles:
            try:
                text = article.inner_text()[:800]
                # Deduplicate across scroll passes by a text-prefix signature.
                sig = text[:100]
                if sig in seen_texts:
                    continue
                seen_texts.add(sig)

                # Extract hrefs from the post (permalinks, outbound links).
                links = article.query_selector_all("a")
                urls = [l.get_attribute("href") for l in links if l.get_attribute("href")]

                posts.append({
                    "account": account,
                    "text": text,
                    "urls": urls[:5],
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                })
            except Exception:  # stale element handles are common while scrolling
                continue

        # Scroll down to trigger lazy-loading of additional posts.
        page.evaluate("window.scrollBy(0, 1500)")
        page.wait_for_timeout(1500)

    return posts
|
||||
|
||||
|
||||
def is_trading_related(text, keywords=None):
    """Return True if *text* contains any trading keyword (case-insensitive).

    Args:
        text: post text to scan.
        keywords: optional iterable of lowercase substrings to match;
            defaults to the module-level TRADING_KEYWORDS list, so existing
            callers are unaffected.
    """
    if keywords is None:
        keywords = TRADING_KEYWORDS
    text_lower = text.lower()
    return any(kw in text_lower for kw in keywords)
|
||||
|
||||
|
||||
def main():
    """Run one scrape pass: authenticate via saved cookies, scrape every
    account in ACCOUNTS, persist trading-related posts, and send Telegram
    alerts for posts not seen in previous runs."""
    from playwright.sync_api import sync_playwright

    print("=== X Feed Scraper (Playwright) ===")  # was an f-string with no placeholders
    print(f"Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")

    # Export cookies from Chrome if we don't have them yet
    if not COOKIE_FILE.exists():
        print("No cookies found — exporting from Chrome debug session...")
        export_cookies_from_chrome()

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 900},
        )

        load_cookies(context)
        page = context.new_page()

        all_posts = []
        trading_posts = []

        for account in ACCOUNTS:
            print(f"\nScraping @{account}...", end=" ", flush=True)
            posts = scrape_account(page, account)
            print(f"{len(posts)} posts")

            for post in posts:
                all_posts.append(post)
                if is_trading_related(post["text"]):
                    trading_posts.append(post)

        browser.close()

    print(f"\n{'='*50}")
    print(f"Total posts: {len(all_posts)}")
    print(f"Trading-related: {len(trading_posts)}")

    # Save results (only trading-related posts are persisted in full).
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
    out_file = DATA_DIR / f"scan-{timestamp}.json"
    out_file.write_text(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_posts": len(all_posts),
        "trading_posts": len(trading_posts),
        "posts": trading_posts,
    }, indent=2))
    print(f"Saved to {out_file}")

    # Check for new posts we haven't seen before
    seen_file = DATA_DIR / "seen_posts.json"
    seen = set()
    if seen_file.exists():
        try:
            seen = set(json.loads(seen_file.read_text()))
        except (ValueError, OSError):  # was bare except; corrupt/unreadable ledger -> start fresh
            pass

    new_posts = []
    for post in trading_posts:
        # Text-prefix signature (150 chars) identifies a post across runs.
        sig = post["text"][:150]
        if sig not in seen:
            new_posts.append(post)
            seen.add(sig)

    # NOTE(review): sets are unordered, so "[-5000:]" trims arbitrary entries
    # rather than the oldest — acceptable for a best-effort dedupe cap.
    seen_file.write_text(json.dumps(list(seen)[-5000:]))  # Keep last 5000

    if new_posts:
        print(f"\n🔔 {len(new_posts)} NEW trading posts!")
        for post in new_posts[:5]:  # alert on at most 5 posts per run
            author = f"@{post['account']}"
            snippet = post["text"][:200].replace("\n", " ")
            print(f"\n {author}: {snippet}")

            # Alert on Telegram, linking the first x.com URL when present.
            msg = f"🔍 <b>New from {author}</b>\n\n{snippet[:300]}"
            if post.get("urls"):
                x_urls = [u for u in post["urls"] if "x.com" in u or "twitter.com" in u]
                if x_urls:
                    msg += f"\n\n{x_urls[0]}"
            send_telegram(msg)
    else:
        print("\nNo new trading posts since last scan.")
|
||||
|
||||
|
||||
# Script entry point: perform a single scrape-and-alert pass.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user