Files
workspace/projects/feed-hunter/x_scraper_pw.py

250 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""
X/Twitter Feed Scraper using Playwright
Scrapes specific accounts for trading-related posts.
Uses saved Chrome session cookies for authentication.
"""
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
# X/Twitter handles scraped on every run.
ACCOUNTS = [
    "browomo", "ArchiveExplorer", "noisyb0y1", "krajekis",
    "Shelpid_WI3M", "polyaboretum", "0xashensoul",
]
# Case-insensitive substrings used to flag a post as trading-related
# (matched against lowercased post text in is_trading_related).
TRADING_KEYWORDS = [
    "polymarket", "trade", "profit", "wallet", "arbitrage", "signal",
    "crypto", "bitcoin", "ethereum", "solana", "strategy", "edge",
    "bet", "position", "stock", "market", "pnl", "alpha",
    "$", "usdc", "defi", "token", "copy", "whale", "degen",
    "short", "long", "bullish", "bearish", "portfolio",
]
# Output directory for scan results; created eagerly at import time.
DATA_DIR = Path(__file__).parent.parent / "data" / "x-feed"
DATA_DIR.mkdir(parents=True, exist_ok=True)
# Playwright-format cookie cache stored next to this script.
COOKIE_FILE = Path(__file__).parent / "x_cookies.json"
# Telegram alert configuration; an empty token disables Telegram and
# falls back to printing alerts (see send_telegram).
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "6443752046")
def send_telegram(message):
    """Send *message* to the configured Telegram chat (best-effort).

    When TELEGRAM_BOT_TOKEN is unset the alert is printed to stdout
    instead, so the scraper still surfaces findings without Telegram.
    Network failures are logged and swallowed — alerting must never
    break a scrape run.
    """
    if not TELEGRAM_BOT_TOKEN:
        print(f"[ALERT] {message}")
        return
    import urllib.request
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = json.dumps({"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "HTML"}).encode()
    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response; the original call
        # leaked the socket by never closing the returned object.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        print(f"Telegram error: {e}")
def save_cookies(context):
    """Persist the browser context's cookies to COOKIE_FILE as pretty JSON."""
    cookie_list = context.cookies()
    serialized = json.dumps(cookie_list, indent=2)
    COOKIE_FILE.write_text(serialized)
    print(f"Saved {len(cookie_list)} cookies")
def load_cookies(context):
    """Load saved cookies from COOKIE_FILE into *context*.

    Returns True when a cookie file existed and was loaded, else False.
    """
    if not COOKIE_FILE.exists():
        return False
    stored = json.loads(COOKIE_FILE.read_text())
    context.add_cookies(stored)
    print(f"Loaded {len(stored)} cookies")
    return True
def export_cookies_from_chrome():
    """One-time: grab cookies from the running Chrome debug instance.

    Queries Chrome's DevTools HTTP endpoint on localhost:9222 for open
    tabs, finds one on x.com, pulls all cookies over the tab's DevTools
    websocket, filters to x.com/twitter.com domains, converts them to
    Playwright's cookie dict format, and writes them to COOKIE_FILE.

    Returns the list of exported cookies ([] when no X tab is open).
    """
    import http.client, websocket as ws_mod
    conn = http.client.HTTPConnection("localhost", 9222)
    try:
        conn.request("GET", "/json")
        tabs = json.loads(conn.getresponse().read())
    finally:
        # Original leaked the debug-port connection; always close it.
        conn.close()
    x_tab = None
    for t in tabs:
        if "x.com" in t.get("url", ""):
            x_tab = t
            break
    if not x_tab:
        print("No X tab found in Chrome debug")
        return []
    ws = ws_mod.create_connection(x_tab["webSocketDebuggerUrl"], timeout=10)
    try:
        ws.send(json.dumps({"id": 1, "method": "Network.getAllCookies"}))
        result = json.loads(ws.recv())
    finally:
        # Close the websocket even when send/recv raises (original only
        # closed on the success path).
        ws.close()
    all_cookies = result.get("result", {}).get("cookies", [])
    # Filter for x.com cookies and convert to Playwright format.
    x_cookies = []
    for c in all_cookies:
        if "x.com" in c.get("domain", "") or "twitter.com" in c.get("domain", ""):
            x_cookies.append({
                "name": c["name"],
                "value": c["value"],
                "domain": c["domain"],
                "path": c.get("path", "/"),
                "secure": c.get("secure", False),
                "httpOnly": c.get("httpOnly", False),
                # NOTE(review): Playwright expects "Strict"/"Lax"/"None";
                # confirm CDP's sameSite values match if add_cookies rejects.
                "sameSite": c.get("sameSite", "Lax"),
            })
    COOKIE_FILE.write_text(json.dumps(x_cookies, indent=2))
    print(f"Exported {len(x_cookies)} X cookies from Chrome")
    return x_cookies
def scrape_account(page, account, max_scroll=5):
    """Scrape recent posts from a single account.

    Navigates to https://x.com/<account>, then alternates *max_scroll*
    times between harvesting the currently visible <article> elements and
    scrolling to trigger lazy-loading.  Posts are deduplicated by the
    first 100 characters of their text.

    Returns a list of dicts with keys "account", "text", "urls",
    "scraped_at"; returns [] when the profile page fails to load.
    """
    posts = []
    try:
        page.goto(f"https://x.com/{account}", wait_until="networkidle", timeout=15000)
    except Exception:
        # X often never reaches networkidle; retry with a looser wait.
        # (Original used bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.)
        try:
            page.goto(f"https://x.com/{account}", wait_until="domcontentloaded", timeout=10000)
            page.wait_for_timeout(3000)
        except Exception as e:
            print(f" Failed to load @{account}: {e}")
            return posts
    seen_texts = set()
    for _ in range(max_scroll):
        articles = page.query_selector_all("article")
        for article in articles:
            try:
                text = article.inner_text()[:800]
                # Deduplicate by a text-prefix signature.
                sig = text[:100]
                if sig in seen_texts:
                    continue
                seen_texts.add(sig)
                # Extract hrefs (permalinks / outbound links) from the article.
                links = article.query_selector_all("a")
                urls = [l.get_attribute("href") for l in links if l.get_attribute("href")]
                posts.append({
                    "account": account,
                    "text": text,
                    "urls": urls[:5],
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                })
            except Exception:
                # Stale/detached DOM node mid-scroll — skip it, keep the rest.
                continue
        # Scroll down to load more posts.
        page.evaluate("window.scrollBy(0, 1500)")
        page.wait_for_timeout(1500)
    return posts
def is_trading_related(text):
    """Return True when *text* contains any TRADING_KEYWORDS entry (case-insensitive)."""
    lowered = text.lower()
    for keyword in TRADING_KEYWORDS:
        if keyword in lowered:
            return True
    return False
def main():
    """Run one scrape cycle.

    Loads the saved X session cookies (exporting them from Chrome's debug
    port on first run), scrapes every account in ACCOUNTS with a headless
    Chromium, saves all trading-related posts to a timestamped JSON file
    in DATA_DIR, and sends a Telegram alert for each post not seen in a
    previous run (tracked in seen_posts.json).
    """
    from playwright.sync_api import sync_playwright
    print("=== X Feed Scraper (Playwright) ===")
    print(f"Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    # Export cookies from Chrome if we don't have them yet.
    if not COOKIE_FILE.exists():
        print("No cookies found — exporting from Chrome debug session...")
        export_cookies_from_chrome()
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 900},
        )
        load_cookies(context)
        page = context.new_page()
        all_posts = []
        trading_posts = []
        try:
            for account in ACCOUNTS:
                print(f"\nScraping @{account}...", end=" ", flush=True)
                posts = scrape_account(page, account)
                print(f"{len(posts)} posts")
                for post in posts:
                    all_posts.append(post)
                    if is_trading_related(post["text"]):
                        trading_posts.append(post)
        finally:
            # Close the browser even if a scrape raises (the original
            # leaked the Chromium process on any mid-loop exception).
            browser.close()
    print(f"\n{'='*50}")
    print(f"Total posts: {len(all_posts)}")
    print(f"Trading-related: {len(trading_posts)}")
    # Save results.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
    out_file = DATA_DIR / f"scan-{timestamp}.json"
    out_file.write_text(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_posts": len(all_posts),
        "trading_posts": len(trading_posts),
        "posts": trading_posts,
    }, indent=2))
    print(f"Saved to {out_file}")
    # Check for new posts we haven't seen before.
    seen_file = DATA_DIR / "seen_posts.json"
    seen = set()
    if seen_file.exists():
        try:
            seen = set(json.loads(seen_file.read_text()))
        except (OSError, ValueError):
            # Corrupt or unreadable state file: start fresh rather than
            # crash.  (json.JSONDecodeError subclasses ValueError; the
            # original bare `except:` also trapped KeyboardInterrupt.)
            pass
    new_posts = []
    for post in trading_posts:
        # Same prefix-signature scheme used for intra-run dedup, but
        # longer, to track posts across runs.
        sig = post["text"][:150]
        if sig not in seen:
            new_posts.append(post)
            seen.add(sig)
    seen_file.write_text(json.dumps(list(seen)[-5000:]))  # Keep last 5000
    if new_posts:
        print(f"\n🔔 {len(new_posts)} NEW trading posts!")
        for post in new_posts[:5]:
            author = f"@{post['account']}"
            snippet = post["text"][:200].replace("\n", " ")
            print(f"\n {author}: {snippet}")
            # Alert on Telegram; prefer linking the post itself when an
            # x.com/twitter.com URL was captured.
            msg = f"🔍 <b>New from {author}</b>\n\n{snippet[:300]}"
            if post.get("urls"):
                x_urls = [u for u in post["urls"] if "x.com" in u or "twitter.com" in u]
                if x_urls:
                    msg += f"\n\n{x_urls[0]}"
            send_telegram(msg)
    else:
        print("\nNo new trading posts since last scan.")


if __name__ == "__main__":
    main()