Playwright X scraper, daily notes, feed analysis
This commit is contained in:
249
projects/feed-hunter/x_scraper_pw.py
Normal file
249
projects/feed-hunter/x_scraper_pw.py
Normal file
@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
X/Twitter Feed Scraper using Playwright
|
||||
Scrapes specific accounts for trading-related posts.
|
||||
Uses saved Chrome session cookies for authentication.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# X handles to scrape (no leading '@').
ACCOUNTS = [
    "browomo", "ArchiveExplorer", "noisyb0y1", "krajekis",
    "Shelpid_WI3M", "polyaboretum", "0xashensoul",
]

# Lowercase substrings matched case-insensitively against post text in
# is_trading_related(); "$" catches cashtags and price mentions.
TRADING_KEYWORDS = [
    "polymarket", "trade", "profit", "wallet", "arbitrage", "signal",
    "crypto", "bitcoin", "ethereum", "solana", "strategy", "edge",
    "bet", "position", "stock", "market", "pnl", "alpha",
    "$", "usdc", "defi", "token", "copy", "whale", "degen",
    "short", "long", "bullish", "bearish", "portfolio",
]

# Scan results and the seen-posts ledger live under <repo>/data/x-feed.
DATA_DIR = Path(__file__).parent.parent / "data" / "x-feed"
DATA_DIR.mkdir(parents=True, exist_ok=True)  # import-time side effect: ensure the dir exists

# Playwright-format cookie jar, exported once from a live Chrome debug session.
COOKIE_FILE = Path(__file__).parent / "x_cookies.json"
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
# NOTE(review): hard-coded fallback chat id — consider requiring the env var instead.
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "6443752046")
|
||||
|
||||
|
||||
def send_telegram(message):
    """Send *message* to the configured Telegram chat (best-effort).

    Falls back to printing the alert locally when TELEGRAM_BOT_TOKEN is
    unset, so the scraper still surfaces findings without credentials.
    API errors are logged and swallowed — an alert failure must not
    abort a scrape run.
    """
    if not TELEGRAM_BOT_TOKEN:
        print(f"[ALERT] {message}")
        return
    import urllib.request
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = json.dumps({"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "HTML"}).encode()
    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
    try:
        # Close the response explicitly; a bare urlopen() leaks the socket.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        print(f"Telegram error: {e}")
|
||||
|
||||
|
||||
def save_cookies(context):
    """Serialize the browser context's cookies to COOKIE_FILE as JSON."""
    cookie_list = context.cookies()
    serialized = json.dumps(cookie_list, indent=2)
    COOKIE_FILE.write_text(serialized)
    print(f"Saved {len(cookie_list)} cookies")
|
||||
|
||||
|
||||
def load_cookies(context):
    """Load saved cookies from COOKIE_FILE into *context*.

    Returns True when a cookie file existed and was applied, False otherwise.
    """
    if not COOKIE_FILE.exists():
        return False
    stored = json.loads(COOKIE_FILE.read_text())
    context.add_cookies(stored)
    print(f"Loaded {len(stored)} cookies")
    return True
|
||||
|
||||
|
||||
def export_cookies_from_chrome():
    """One-time: grab cookies from the running Chrome debug instance.

    Connects to the Chrome DevTools HTTP endpoint on localhost:9222,
    locates an open x.com tab, pulls all cookies over the tab's DevTools
    WebSocket (Network.getAllCookies), and writes the x.com/twitter.com
    subset to COOKIE_FILE in Playwright's cookie format.

    Returns the list of exported cookies ([] when no X tab is open).
    """
    import http.client
    import websocket as ws_mod  # third-party 'websocket-client', used only here

    conn = http.client.HTTPConnection("localhost", 9222)
    try:
        conn.request("GET", "/json")
        tabs = json.loads(conn.getresponse().read())
    finally:
        conn.close()  # original leaked the DevTools HTTP connection

    x_tab = None
    for t in tabs:
        if "x.com" in t.get("url", ""):
            x_tab = t
            break

    if not x_tab:
        print("No X tab found in Chrome debug")
        return []

    ws = ws_mod.create_connection(x_tab["webSocketDebuggerUrl"], timeout=10)
    try:
        ws.send(json.dumps({"id": 1, "method": "Network.getAllCookies"}))
        result = json.loads(ws.recv())
    finally:
        ws.close()  # close even when send/recv raises
    all_cookies = result.get("result", {}).get("cookies", [])

    # Filter for x.com cookies and convert to Playwright format
    x_cookies = []
    for c in all_cookies:
        if "x.com" in c.get("domain", "") or "twitter.com" in c.get("domain", ""):
            x_cookies.append({
                "name": c["name"],
                "value": c["value"],
                "domain": c["domain"],
                "path": c.get("path", "/"),
                "secure": c.get("secure", False),
                "httpOnly": c.get("httpOnly", False),
                "sameSite": c.get("sameSite", "Lax"),
            })

    COOKIE_FILE.write_text(json.dumps(x_cookies, indent=2))
    print(f"Exported {len(x_cookies)} X cookies from Chrome")
    return x_cookies
|
||||
|
||||
|
||||
def scrape_account(page, account, max_scroll=5):
    """Scrape recent posts from a single account's X profile page.

    Args:
        page: Playwright page (assumed already authenticated via cookies).
        account: handle without the leading '@'.
        max_scroll: number of scroll passes used to lazy-load more posts.

    Returns:
        List of post dicts with keys: account, text, urls, scraped_at.
    """
    posts = []

    # Prefer waiting for network idle; X's long-polling often keeps the
    # network busy forever, so fall back to DOM-loaded plus a fixed wait.
    try:
        page.goto(f"https://x.com/{account}", wait_until="networkidle", timeout=15000)
    except Exception:  # was bare `except:`, which also swallowed KeyboardInterrupt
        try:
            page.goto(f"https://x.com/{account}", wait_until="domcontentloaded", timeout=10000)
            page.wait_for_timeout(3000)
        except Exception as e:
            print(f" Failed to load @{account}: {e}")
            return posts

    seen_texts = set()

    for scroll in range(max_scroll):
        articles = page.query_selector_all("article")

        for article in articles:
            try:
                text = article.inner_text()[:800]
                # Deduplicate across scroll passes by a text-prefix signature.
                sig = text[:100]
                if sig in seen_texts:
                    continue
                seen_texts.add(sig)

                # Extract hrefs from the post (permalinks, outbound links).
                links = article.query_selector_all("a")
                urls = [l.get_attribute("href") for l in links if l.get_attribute("href")]

                posts.append({
                    "account": account,
                    "text": text,
                    "urls": urls[:5],
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                })
            except Exception:  # stale element handles are common while scrolling
                continue

        # Scroll down to trigger lazy-loading of additional posts.
        page.evaluate("window.scrollBy(0, 1500)")
        page.wait_for_timeout(1500)

    return posts
|
||||
|
||||
|
||||
def is_trading_related(text, keywords=None):
    """Return True if *text* contains any trading keyword (case-insensitive).

    Args:
        text: post text to scan.
        keywords: optional iterable of lowercase substrings to match;
            defaults to the module-level TRADING_KEYWORDS list, so existing
            callers are unaffected.
    """
    if keywords is None:
        keywords = TRADING_KEYWORDS
    text_lower = text.lower()
    return any(kw in text_lower for kw in keywords)
|
||||
|
||||
|
||||
def main():
    """Run one scrape pass: authenticate via saved cookies, scrape every
    account in ACCOUNTS, persist trading-related posts, and send Telegram
    alerts for posts not seen in previous runs."""
    from playwright.sync_api import sync_playwright

    print("=== X Feed Scraper (Playwright) ===")  # was an f-string with no placeholders
    print(f"Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")

    # Export cookies from Chrome if we don't have them yet
    if not COOKIE_FILE.exists():
        print("No cookies found — exporting from Chrome debug session...")
        export_cookies_from_chrome()

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 900},
        )

        load_cookies(context)
        page = context.new_page()

        all_posts = []
        trading_posts = []

        for account in ACCOUNTS:
            print(f"\nScraping @{account}...", end=" ", flush=True)
            posts = scrape_account(page, account)
            print(f"{len(posts)} posts")

            for post in posts:
                all_posts.append(post)
                if is_trading_related(post["text"]):
                    trading_posts.append(post)

        browser.close()

    print(f"\n{'='*50}")
    print(f"Total posts: {len(all_posts)}")
    print(f"Trading-related: {len(trading_posts)}")

    # Save results (only trading-related posts are persisted in full).
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
    out_file = DATA_DIR / f"scan-{timestamp}.json"
    out_file.write_text(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_posts": len(all_posts),
        "trading_posts": len(trading_posts),
        "posts": trading_posts,
    }, indent=2))
    print(f"Saved to {out_file}")

    # Check for new posts we haven't seen before
    seen_file = DATA_DIR / "seen_posts.json"
    seen = set()
    if seen_file.exists():
        try:
            seen = set(json.loads(seen_file.read_text()))
        except (ValueError, OSError):  # was bare except; corrupt/unreadable ledger -> start fresh
            pass

    new_posts = []
    for post in trading_posts:
        # Text-prefix signature (150 chars) identifies a post across runs.
        sig = post["text"][:150]
        if sig not in seen:
            new_posts.append(post)
            seen.add(sig)

    # NOTE(review): sets are unordered, so "[-5000:]" trims arbitrary entries
    # rather than the oldest — acceptable for a best-effort dedupe cap.
    seen_file.write_text(json.dumps(list(seen)[-5000:]))  # Keep last 5000

    if new_posts:
        print(f"\n🔔 {len(new_posts)} NEW trading posts!")
        for post in new_posts[:5]:  # alert on at most 5 posts per run
            author = f"@{post['account']}"
            snippet = post["text"][:200].replace("\n", " ")
            print(f"\n {author}: {snippet}")

            # Alert on Telegram, linking the first x.com URL when present.
            msg = f"🔍 <b>New from {author}</b>\n\n{snippet[:300]}"
            if post.get("urls"):
                x_urls = [u for u in post["urls"] if "x.com" in u or "twitter.com" in u]
                if x_urls:
                    msg += f"\n\n{x_urls[0]}"
            send_telegram(msg)
    else:
        print("\nNo new trading posts since last scan.")
|
||||
|
||||
|
||||
# Script entry point: perform a single scrape-and-alert pass.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user