Files
workspace/projects/feed-hunter/x_scraper_pw.py

250 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""
X/Twitter Feed Scraper using Playwright
Scrapes specific accounts for trading-related posts.
Uses saved Chrome session cookies for authentication.
"""
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
# X/Twitter handles scraped on every run.
ACCOUNTS = [
    "browomo", "ArchiveExplorer", "noisyb0y1", "krajekis",
    "Shelpid_WI3M", "polyaboretum", "0xashensoul",
]
# Case-insensitive substrings used to flag a post as trading-related
# (matched against lowercased post text in is_trading_related).
TRADING_KEYWORDS = [
    "polymarket", "trade", "profit", "wallet", "arbitrage", "signal",
    "crypto", "bitcoin", "ethereum", "solana", "strategy", "edge",
    "bet", "position", "stock", "market", "pnl", "alpha",
    "$", "usdc", "defi", "token", "copy", "whale", "degen",
    "short", "long", "bullish", "bearish", "portfolio",
]
# Output directory for scan results; created eagerly at import time.
DATA_DIR = Path(__file__).parent.parent / "data" / "x-feed"
DATA_DIR.mkdir(parents=True, exist_ok=True)
# Playwright-format cookie cache stored next to this script.
COOKIE_FILE = Path(__file__).parent / "x_cookies.json"
# Telegram alert configuration; an empty token disables Telegram and
# falls back to printing alerts (see send_telegram).
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "6443752046")
def send_telegram(message):
    """Send *message* to the configured Telegram chat (best-effort).

    When TELEGRAM_BOT_TOKEN is unset the alert is printed to stdout
    instead, so the scraper still surfaces findings without Telegram.
    Network failures are logged and swallowed — alerting must never
    break a scrape run.
    """
    if not TELEGRAM_BOT_TOKEN:
        print(f"[ALERT] {message}")
        return
    import urllib.request
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = json.dumps({"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "HTML"}).encode()
    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response; the original call
        # leaked the socket by never closing the returned object.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        print(f"Telegram error: {e}")
def save_cookies(context):
    """Persist the browser context's cookies to COOKIE_FILE as pretty JSON."""
    cookie_list = context.cookies()
    serialized = json.dumps(cookie_list, indent=2)
    COOKIE_FILE.write_text(serialized)
    print(f"Saved {len(cookie_list)} cookies")
def load_cookies(context):
    """Load saved cookies from COOKIE_FILE into *context*.

    Returns True when a cookie file existed and was loaded, else False.
    """
    if not COOKIE_FILE.exists():
        return False
    stored = json.loads(COOKIE_FILE.read_text())
    context.add_cookies(stored)
    print(f"Loaded {len(stored)} cookies")
    return True
def export_cookies_from_chrome():
    """One-time: grab cookies from the running Chrome debug instance.

    Queries Chrome's DevTools HTTP endpoint on localhost:9222 for open
    tabs, finds one on x.com, pulls all cookies over the tab's DevTools
    websocket, filters to x.com/twitter.com domains, converts them to
    Playwright's cookie dict format, and writes them to COOKIE_FILE.

    Returns the list of exported cookies ([] when no X tab is open).
    """
    import http.client, websocket as ws_mod
    conn = http.client.HTTPConnection("localhost", 9222)
    try:
        conn.request("GET", "/json")
        tabs = json.loads(conn.getresponse().read())
    finally:
        # Original leaked the debug-port connection; always close it.
        conn.close()
    x_tab = None
    for t in tabs:
        if "x.com" in t.get("url", ""):
            x_tab = t
            break
    if not x_tab:
        print("No X tab found in Chrome debug")
        return []
    ws = ws_mod.create_connection(x_tab["webSocketDebuggerUrl"], timeout=10)
    try:
        ws.send(json.dumps({"id": 1, "method": "Network.getAllCookies"}))
        result = json.loads(ws.recv())
    finally:
        # Close the websocket even when send/recv raises (original only
        # closed on the success path).
        ws.close()
    all_cookies = result.get("result", {}).get("cookies", [])
    # Filter for x.com cookies and convert to Playwright format.
    x_cookies = []
    for c in all_cookies:
        if "x.com" in c.get("domain", "") or "twitter.com" in c.get("domain", ""):
            x_cookies.append({
                "name": c["name"],
                "value": c["value"],
                "domain": c["domain"],
                "path": c.get("path", "/"),
                "secure": c.get("secure", False),
                "httpOnly": c.get("httpOnly", False),
                # NOTE(review): Playwright expects "Strict"/"Lax"/"None";
                # confirm CDP's sameSite values match if add_cookies rejects.
                "sameSite": c.get("sameSite", "Lax"),
            })
    COOKIE_FILE.write_text(json.dumps(x_cookies, indent=2))
    print(f"Exported {len(x_cookies)} X cookies from Chrome")
    return x_cookies
def scrape_account(page, account, max_scroll=5):
    """Scrape recent posts from a single account.

    Navigates to https://x.com/<account>, then alternates *max_scroll*
    times between harvesting the currently visible <article> elements and
    scrolling to trigger lazy-loading.  Posts are deduplicated by the
    first 100 characters of their text.

    Returns a list of dicts with keys "account", "text", "urls",
    "scraped_at"; returns [] when the profile page fails to load.
    """
    posts = []
    try:
        page.goto(f"https://x.com/{account}", wait_until="networkidle", timeout=15000)
    except Exception:
        # X often never reaches networkidle; retry with a looser wait.
        # (Original used bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.)
        try:
            page.goto(f"https://x.com/{account}", wait_until="domcontentloaded", timeout=10000)
            page.wait_for_timeout(3000)
        except Exception as e:
            print(f" Failed to load @{account}: {e}")
            return posts
    seen_texts = set()
    for _ in range(max_scroll):
        articles = page.query_selector_all("article")
        for article in articles:
            try:
                text = article.inner_text()[:800]
                # Deduplicate by a text-prefix signature.
                sig = text[:100]
                if sig in seen_texts:
                    continue
                seen_texts.add(sig)
                # Extract hrefs (permalinks / outbound links) from the article.
                links = article.query_selector_all("a")
                urls = [l.get_attribute("href") for l in links if l.get_attribute("href")]
                posts.append({
                    "account": account,
                    "text": text,
                    "urls": urls[:5],
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                })
            except Exception:
                # Stale/detached DOM node mid-scroll — skip it, keep the rest.
                continue
        # Scroll down to load more posts.
        page.evaluate("window.scrollBy(0, 1500)")
        page.wait_for_timeout(1500)
    return posts
def is_trading_related(text):
    """Return True when *text* contains any TRADING_KEYWORDS entry (case-insensitive)."""
    lowered = text.lower()
    for keyword in TRADING_KEYWORDS:
        if keyword in lowered:
            return True
    return False
def main():
    """Run one scrape cycle.

    Loads the saved X session cookies (exporting them from Chrome's debug
    port on first run), scrapes every account in ACCOUNTS with a headless
    Chromium, saves all trading-related posts to a timestamped JSON file
    in DATA_DIR, and sends a Telegram alert for each post not seen in a
    previous run (tracked in seen_posts.json).
    """
    from playwright.sync_api import sync_playwright
    print("=== X Feed Scraper (Playwright) ===")
    print(f"Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    # Export cookies from Chrome if we don't have them yet.
    if not COOKIE_FILE.exists():
        print("No cookies found — exporting from Chrome debug session...")
        export_cookies_from_chrome()
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 900},
        )
        load_cookies(context)
        page = context.new_page()
        all_posts = []
        trading_posts = []
        try:
            for account in ACCOUNTS:
                print(f"\nScraping @{account}...", end=" ", flush=True)
                posts = scrape_account(page, account)
                print(f"{len(posts)} posts")
                for post in posts:
                    all_posts.append(post)
                    if is_trading_related(post["text"]):
                        trading_posts.append(post)
        finally:
            # Close the browser even if a scrape raises (the original
            # leaked the Chromium process on any mid-loop exception).
            browser.close()
    print(f"\n{'='*50}")
    print(f"Total posts: {len(all_posts)}")
    print(f"Trading-related: {len(trading_posts)}")
    # Save results.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
    out_file = DATA_DIR / f"scan-{timestamp}.json"
    out_file.write_text(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_posts": len(all_posts),
        "trading_posts": len(trading_posts),
        "posts": trading_posts,
    }, indent=2))
    print(f"Saved to {out_file}")
    # Check for new posts we haven't seen before.
    seen_file = DATA_DIR / "seen_posts.json"
    seen = set()
    if seen_file.exists():
        try:
            seen = set(json.loads(seen_file.read_text()))
        except (OSError, ValueError):
            # Corrupt or unreadable state file: start fresh rather than
            # crash.  (json.JSONDecodeError subclasses ValueError; the
            # original bare `except:` also trapped KeyboardInterrupt.)
            pass
    new_posts = []
    for post in trading_posts:
        # Same prefix-signature scheme used for intra-run dedup, but
        # longer, to track posts across runs.
        sig = post["text"][:150]
        if sig not in seen:
            new_posts.append(post)
            seen.add(sig)
    seen_file.write_text(json.dumps(list(seen)[-5000:]))  # Keep last 5000
    if new_posts:
        print(f"\n🔔 {len(new_posts)} NEW trading posts!")
        for post in new_posts[:5]:
            author = f"@{post['account']}"
            snippet = post["text"][:200].replace("\n", " ")
            print(f"\n {author}: {snippet}")
            # Alert on Telegram; prefer linking the post itself when an
            # x.com/twitter.com URL was captured.
            msg = f"🔍 <b>New from {author}</b>\n\n{snippet[:300]}"
            if post.get("urls"):
                x_urls = [u for u in post["urls"] if "x.com" in u or "twitter.com" in u]
                if x_urls:
                    msg += f"\n\n{x_urls[0]}"
            send_telegram(msg)
    else:
        print("\nNo new trading posts since last scan.")


if __name__ == "__main__":
    main()