250 lines
8.0 KiB
Python
250 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
X/Twitter Feed Scraper using Playwright
|
|
Scrapes specific accounts for trading-related posts.
|
|
Uses saved Chrome session cookies for authentication.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# X/Twitter handles to monitor (no leading "@").
ACCOUNTS = [
    "browomo", "ArchiveExplorer", "noisyb0y1", "krajekis",
    "Shelpid_WI3M", "polyaboretum", "0xashensoul",
]

# Case-insensitive substrings used by is_trading_related() to flag a post.
# NOTE(review): these are plain substring matches — "bet" also hits "better",
# and "$" hits any dollar sign; expect false positives by design.
TRADING_KEYWORDS = [
    "polymarket", "trade", "profit", "wallet", "arbitrage", "signal",
    "crypto", "bitcoin", "ethereum", "solana", "strategy", "edge",
    "bet", "position", "stock", "market", "pnl", "alpha",
    "$", "usdc", "defi", "token", "copy", "whale", "degen",
    "short", "long", "bullish", "bearish", "portfolio",
]

# Output directory for scan results; created as a side effect at import time.
DATA_DIR = Path(__file__).parent.parent / "data" / "x-feed"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Playwright-format cookie cache, exported once from a Chrome debug session.
COOKIE_FILE = Path(__file__).parent / "x_cookies.json"
# Telegram alert credentials; with no token, alerts fall back to stdout.
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "6443752046")
|
|
|
|
|
|
def send_telegram(message):
    """Send *message* to the configured Telegram chat via the Bot API.

    Falls back to printing the message when TELEGRAM_BOT_TOKEN is unset.
    Network failures are logged to stdout, never raised.
    """
    if not TELEGRAM_BOT_TOKEN:
        print(f"[ALERT] {message}")
        return
    import urllib.request
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = json.dumps({"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "HTML"}).encode()
    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response; the original leaked it.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        print(f"Telegram error: {e}")
|
|
|
|
|
|
def save_cookies(context):
    """Persist every cookie from *context* to COOKIE_FILE as pretty JSON."""
    cookie_jar = context.cookies()
    serialized = json.dumps(cookie_jar, indent=2)
    COOKIE_FILE.write_text(serialized)
    print(f"Saved {len(cookie_jar)} cookies")
|
|
|
|
|
|
def load_cookies(context):
    """Load cached cookies from COOKIE_FILE into the browser *context*.

    Returns True when a cookie file existed and was applied, else False.
    """
    if not COOKIE_FILE.exists():
        return False
    stored = json.loads(COOKIE_FILE.read_text())
    context.add_cookies(stored)
    print(f"Loaded {len(stored)} cookies")
    return True
|
|
|
|
|
|
def export_cookies_from_chrome():
    """One-time: grab cookies from the running Chrome debug instance.

    Connects to Chrome's DevTools endpoint on localhost:9222, finds a tab
    on x.com, fetches all cookies over the CDP websocket, keeps only the
    x.com/twitter.com ones converted to Playwright format, and caches them
    in COOKIE_FILE. Returns the exported cookie list ([] when no X tab).
    """
    import http.client, websocket as ws_mod

    conn = http.client.HTTPConnection("localhost", 9222)
    try:
        conn.request("GET", "/json")
        tabs = json.loads(conn.getresponse().read())
    finally:
        conn.close()  # the original left the debug HTTP connection open

    x_tab = next((t for t in tabs if "x.com" in t.get("url", "")), None)
    if not x_tab:
        print("No X tab found in Chrome debug")
        return []

    ws = ws_mod.create_connection(x_tab["webSocketDebuggerUrl"], timeout=10)
    try:
        ws.send(json.dumps({"id": 1, "method": "Network.getAllCookies"}))
        # CDP may push unsolicited event frames before our reply arrives;
        # keep reading until we see the message answering request id 1
        # (the original took the very first frame unconditionally).
        while True:
            result = json.loads(ws.recv())
            if result.get("id") == 1:
                break
    finally:
        ws.close()
    all_cookies = result.get("result", {}).get("cookies", [])

    # Filter for x.com cookies and convert to Playwright format
    x_cookies = []
    for c in all_cookies:
        if "x.com" in c.get("domain", "") or "twitter.com" in c.get("domain", ""):
            x_cookies.append({
                "name": c["name"],
                "value": c["value"],
                "domain": c["domain"],
                "path": c.get("path", "/"),
                "secure": c.get("secure", False),
                "httpOnly": c.get("httpOnly", False),
                # NOTE(review): CDP reports "Strict"/"Lax"/"None", matching
                # Playwright's expected values; "Lax" is a safe default.
                "sameSite": c.get("sameSite", "Lax"),
            })

    COOKIE_FILE.write_text(json.dumps(x_cookies, indent=2))
    print(f"Exported {len(x_cookies)} X cookies from Chrome")
    return x_cookies
|
|
|
|
|
|
def scrape_account(page, account, max_scroll=5):
    """Scrape recent posts from a single account.

    Navigates to the account's X profile, scrolls *max_scroll* times to
    trigger lazy loading, and collects deduplicated post dicts with keys
    "account", "text", "urls", "scraped_at". Returns [] when the profile
    page cannot be loaded at all.
    """
    posts = []

    # Prefer waiting for network idle; X keeps long-polling connections
    # open, so fall back to DOM-loaded plus a fixed settle delay.
    try:
        page.goto(f"https://x.com/{account}", wait_until="networkidle", timeout=15000)
    except Exception:  # was a bare except: it even swallowed KeyboardInterrupt
        try:
            page.goto(f"https://x.com/{account}", wait_until="domcontentloaded", timeout=10000)
            page.wait_for_timeout(3000)
        except Exception as e:
            print(f" Failed to load @{account}: {e}")
            return posts

    seen_texts = set()

    for scroll in range(max_scroll):
        articles = page.query_selector_all("article")

        for article in articles:
            try:
                text = article.inner_text()[:800]
                # Deduplicate across scroll passes: the first 100 chars are
                # a cheap signature for "same tweet encountered again".
                sig = text[:100]
                if sig in seen_texts:
                    continue
                seen_texts.add(sig)

                # Extract links
                links = article.query_selector_all("a")
                urls = [l.get_attribute("href") for l in links if l.get_attribute("href")]

                posts.append({
                    "account": account,
                    "text": text,
                    "urls": urls[:5],
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                })
            except Exception:  # stale DOM node mid-scroll; skip this article
                continue

        # Scroll down
        page.evaluate("window.scrollBy(0, 1500)")
        page.wait_for_timeout(1500)

    return posts
|
|
|
|
|
|
def is_trading_related(text):
    """Return True if *text* contains any TRADING_KEYWORDS entry (case-insensitive substring match)."""
    lowered = text.lower()
    for keyword in TRADING_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
|
|
|
|
|
def main():
    """Run one full scan.

    Scrapes every account in ACCOUNTS with an authenticated headless
    browser, saves trading-related posts to DATA_DIR, and sends a Telegram
    alert for posts not seen in a previous scan.
    """
    from playwright.sync_api import sync_playwright

    print("=== X Feed Scraper (Playwright) ===")
    print(f"Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")

    # Export cookies from Chrome if we don't have them yet
    if not COOKIE_FILE.exists():
        print("No cookies found — exporting from Chrome debug session...")
        export_cookies_from_chrome()

    all_posts = []
    trading_posts = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 900},
        )

        load_cookies(context)
        page = context.new_page()

        for account in ACCOUNTS:
            print(f"\nScraping @{account}...", end=" ", flush=True)
            posts = scrape_account(page, account)
            print(f"{len(posts)} posts")

            for post in posts:
                all_posts.append(post)
                if is_trading_related(post["text"]):
                    trading_posts.append(post)

        browser.close()

    print(f"\n{'='*50}")
    print(f"Total posts: {len(all_posts)}")
    print(f"Trading-related: {len(trading_posts)}")

    # Save results
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
    out_file = DATA_DIR / f"scan-{timestamp}.json"
    out_file.write_text(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_posts": len(all_posts),
        "trading_posts": len(trading_posts),
        "posts": trading_posts,
    }, indent=2))
    print(f"Saved to {out_file}")

    # Check for new posts we haven't seen before
    seen_file = DATA_DIR / "seen_posts.json"
    seen = set()
    if seen_file.exists():
        try:
            seen = set(json.loads(seen_file.read_text()))
        except (OSError, ValueError):  # corrupt/unreadable cache: start fresh
            pass

    new_posts = []
    for post in trading_posts:
        # The first 150 chars of the text act as a stable post signature.
        sig = post["text"][:150]
        if sig not in seen:
            new_posts.append(post)
            seen.add(sig)

    seen_file.write_text(json.dumps(list(seen)[-5000:]))  # Keep last 5000

    if new_posts:
        print(f"\n🔔 {len(new_posts)} NEW trading posts!")
        for post in new_posts[:5]:
            author = f"@{post['account']}"
            snippet = post["text"][:200].replace("\n", " ")
            print(f"\n {author}: {snippet}")

            # Alert on Telegram
            msg = f"🔍 <b>New from {author}</b>\n\n{snippet[:300]}"
            if post.get("urls"):
                x_urls = [u for u in post["urls"] if "x.com" in u or "twitter.com" in u]
                if x_urls:
                    msg += f"\n\n{x_urls[0]}"
            send_telegram(msg)
    else:
        print("\nNo new trading posts since last scan.")
|
|
|
|
|
|
# Script entry point: run a single scan when executed directly.
if __name__ == "__main__":
    main()
|