Add reddit-scanner project - Reddit search and topic scanning - Sentiment analysis from comments - No auth required (uses public JSON API) - Rate limiting built in
This commit is contained in:
7
projects/reddit-scanner/.gitignore
vendored
Normal file
7
projects/reddit-scanner/.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
venv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.env
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
17
projects/reddit-scanner/README.md
Normal file
17
projects/reddit-scanner/README.md
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# reddit-scanner
|
||||||
|
|
||||||
|
Scan Reddit for sentiment and discussions on topics
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python main.py
|
||||||
|
```
|
||||||
286
projects/reddit-scanner/main.py
Executable file
286
projects/reddit-scanner/main.py
Executable file
@ -0,0 +1,286 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
reddit-scanner - Scan Reddit for sentiment and discussions on topics
|
||||||
|
|
||||||
|
Uses Reddit's public JSON API (no auth required, rate limited).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.request import urlopen, Request
|
||||||
|
from urllib.error import URLError, HTTPError
|
||||||
|
from urllib.parse import quote_plus
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
PROJECT_DIR = Path(__file__).parent
|
||||||
|
DATA_DIR = PROJECT_DIR / "data"
|
||||||
|
CACHE_DIR = DATA_DIR / "cache"
|
||||||
|
|
||||||
|
USER_AGENT = "reddit-scanner/1.0 (personal use)"
|
||||||
|
RATE_LIMIT = 2 # seconds between requests
|
||||||
|
|
||||||
|
last_request = 0
|
||||||
|
|
||||||
|
def fetch_json(url: str, max_retries: int = 3) -> dict:
    """Fetch JSON from Reddit's public API with client-side rate limiting.

    Args:
        url: Full Reddit JSON endpoint URL.
        max_retries: How many times to back off and retry after an HTTP 429
            response. Defaults to 3 so a sustained throttle cannot recurse
            forever (the previous implementation retried via unbounded
            recursion).

    Returns:
        The parsed JSON payload as a dict, or None on any HTTP or network
        failure (callers already check for a falsy result).
    """
    global last_request

    for attempt in range(max_retries + 1):
        # Enforce at least RATE_LIMIT seconds between consecutive requests.
        elapsed = time.time() - last_request
        if elapsed < RATE_LIMIT:
            time.sleep(RATE_LIMIT - elapsed)

        req = Request(url, headers={'User-Agent': USER_AGENT})
        try:
            with urlopen(req, timeout=15) as resp:
                last_request = time.time()
                return json.loads(resp.read().decode('utf-8'))
        except HTTPError as e:
            if e.code == 429 and attempt < max_retries:
                # Reddit throttled us: wait, then loop for a bounded retry.
                print(" Rate limited, waiting...")
                time.sleep(10)
                continue
            print(f" HTTP Error: {e.code}")
            return None
        except URLError as e:
            print(f" Error: {e}")
            return None

    # All retries exhausted on 429 responses.
    return None
|
||||||
|
|
||||||
|
def get_subreddit_posts(subreddit: str, sort: str = "hot", limit: int = 25) -> list:
    """Return a list of post dicts from a subreddit listing.

    Args:
        subreddit: Subreddit name (without the ``r/`` prefix).
        sort: Listing to read — e.g. ``hot``, ``new``, ``top``.
        limit: Maximum number of posts to request.

    Returns:
        A list of flat dicts describing each post; empty on any failure.
    """
    listing = fetch_json(
        f"https://www.reddit.com/r/{subreddit}/{sort}.json?limit={limit}"
    )
    if not listing or 'data' not in listing:
        return []

    def to_record(item):
        # Each listing child wraps the post payload under 'data'.
        p = item['data']
        return {
            'id': p['id'],
            'title': p['title'],
            'author': p['author'],
            'score': p['score'],
            'upvote_ratio': p.get('upvote_ratio', 0),
            'num_comments': p['num_comments'],
            'url': p['url'],
            # Truncate long self-posts so records stay small.
            'selftext': p.get('selftext', '')[:500],
            'created': datetime.fromtimestamp(p['created_utc']).isoformat(),
            'permalink': f"https://reddit.com{p['permalink']}",
            'subreddit': subreddit,
        }

    return [to_record(child) for child in listing['data'].get('children', [])]
|
||||||
|
|
||||||
|
def get_post_comments(subreddit: str, post_id: str, limit: int = 50) -> list:
    """Flatten a post's comment tree into a list of dicts (pre-order).

    Args:
        subreddit: Subreddit the post lives in.
        post_id: Reddit post id (the short base-36 id).
        limit: Maximum comments to request from the API.

    Returns:
        Comment dicts in depth-first order, each tagged with its nesting
        ``depth``; empty on any failure.
    """
    payload = fetch_json(
        f"https://www.reddit.com/r/{subreddit}/comments/{post_id}.json?limit={limit}"
    )
    # The comments endpoint returns [post_listing, comment_listing].
    if not payload or len(payload) < 2:
        return []

    results = []
    # Explicit (node, depth) stack replaces recursion; children are pushed
    # in reverse so pop order matches the original listing order.
    stack = [(c, 0) for c in reversed(payload[1]['data'].get('children', []))]
    while stack:
        node, depth = stack.pop()
        if node['kind'] != 't1':  # 't1' = comment; skip 'more' stubs etc.
            continue
        c = node['data']
        results.append({
            'id': c['id'],
            'author': c['author'],
            'body': c['body'][:1000],
            'score': c['score'],
            'depth': depth,
            'created': datetime.fromtimestamp(c['created_utc']).isoformat(),
        })
        # Nested replies arrive as another listing dict (or "" when absent).
        if c.get('replies') and isinstance(c['replies'], dict):
            children = c['replies']['data'].get('children', [])
            stack.extend((r, depth + 1) for r in reversed(children))

    return results
|
||||||
|
|
||||||
|
def search_reddit(query: str, subreddit: str = None, sort: str = "relevance", limit: int = 25) -> list:
    """Search Reddit and return matching posts.

    Args:
        query: Free-text search query.
        subreddit: If given, restrict the search to this subreddit;
            otherwise search site-wide.
        sort: Result ordering (``relevance``, ``new``, ``top``, ...).
        limit: Maximum number of results to request.

    Returns:
        A list of flat post dicts; empty on any failure.
    """
    q = quote_plus(query)
    if subreddit:
        url = (
            f"https://www.reddit.com/r/{subreddit}/search.json"
            f"?q={q}&restrict_sr=1&sort={sort}&limit={limit}"
        )
    else:
        url = f"https://www.reddit.com/search.json?q={q}&sort={sort}&limit={limit}"

    payload = fetch_json(url)
    if not payload or 'data' not in payload:
        return []

    def to_record(item):
        p = item['data']
        return {
            'id': p['id'],
            'title': p['title'],
            'author': p['author'],
            'score': p['score'],
            'num_comments': p['num_comments'],
            # Site-wide searches span subreddits, so take it from the post.
            'subreddit': p['subreddit'],
            'permalink': f"https://reddit.com{p['permalink']}",
            'created': datetime.fromtimestamp(p['created_utc']).isoformat(),
        }

    return [to_record(child) for child in payload['data'].get('children', [])]
|
||||||
|
|
||||||
|
def analyze_sentiment(texts: list) -> dict:
    """Classify texts as positive/negative/neutral via keyword lexicons.

    Each text votes once: whichever lexicon matches more distinct words in
    the text wins; ties (including zero matches) count as neutral. Word
    frequencies are document frequencies — each word counts at most once
    per text.

    Args:
        texts: List of strings (e.g. comment bodies).

    Returns:
        Dict with counts, percentages, and the 20 most common words.
    """
    pos_lexicon = {'good', 'great', 'awesome', 'love', 'best', 'amazing', 'excellent',
                   'fantastic', 'wonderful', 'perfect', 'nice', 'thanks', 'helpful',
                   'brilliant', 'impressive', 'excited', 'happy', 'beautiful'}
    neg_lexicon = {'bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'sucks',
                   'disappointing', 'frustrated', 'annoying', 'broken', 'useless',
                   'stupid', 'boring', 'ugly', 'failed', 'waste', 'problem', 'issue'}

    tallies = Counter()
    doc_freq = Counter()

    for text in texts:
        # Distinct lowercase word tokens — dedupes repeats within one text.
        tokens = set(re.findall(r'\b\w+\b', text.lower()))
        doc_freq.update(tokens)

        # Net lexicon score decides this text's single vote.
        balance = len(tokens & pos_lexicon) - len(tokens & neg_lexicon)
        if balance > 0:
            tallies['positive'] += 1
        elif balance < 0:
            tallies['negative'] += 1
        else:
            tallies['neutral'] += 1

    total = tallies['positive'] + tallies['negative'] + tallies['neutral']
    return {
        'positive': tallies['positive'],
        'negative': tallies['negative'],
        'neutral': tallies['neutral'],
        'total': total,
        'positive_pct': round(tallies['positive'] / total * 100, 1) if total else 0,
        'negative_pct': round(tallies['negative'] / total * 100, 1) if total else 0,
        'top_words': doc_freq.most_common(20),
    }
|
||||||
|
|
||||||
|
def scan_topic(topic: str, subreddits: list = None):
    """Scan for a topic across subreddits.

    Searches each subreddit for the topic, pulls comments from the top 3
    posts per subreddit, then prints post stats, comment sentiment, top
    words, and the highest-scoring posts. Output-only; returns None.

    Args:
        topic: Search query to scan for.
        subreddits: Subreddit names to search; defaults to site-wide
            (['all']) when omitted or empty.
    """
    print(f"\n🔍 Scanning Reddit for: {topic}")
    print("=" * 50)

    # Falsy (None or empty list) means search all of Reddit.
    if not subreddits:
        subreddits = ['all']

    all_posts = []
    all_comments = []  # raw comment body strings, pooled across all posts

    for sub in subreddits:
        print(f"\n📌 r/{sub}")
        # 'all' is a site-wide search, not a real subreddit restriction.
        posts = search_reddit(topic, sub if sub != 'all' else None, limit=10)
        print(f" Found {len(posts)} posts")

        for post in posts[:3]:  # Get comments from top 3
            print(f" Fetching comments from: {post['title'][:40]}...")
            comments = get_post_comments(post['subreddit'], post['id'], limit=30)
            all_comments.extend([c['body'] for c in comments])

        all_posts.extend(posts)

    # Analyze
    print(f"\n📊 Analysis")
    print("-" * 30)

    if all_posts:
        avg_score = sum(p['score'] for p in all_posts) / len(all_posts)
        avg_comments = sum(p['num_comments'] for p in all_posts) / len(all_posts)
        print(f"Posts analyzed: {len(all_posts)}")
        print(f"Avg score: {avg_score:.0f}")
        print(f"Avg comments: {avg_comments:.0f}")

    if all_comments:
        sentiment = analyze_sentiment(all_comments)
        print(f"\nComment sentiment ({sentiment['total']} comments):")
        print(f" 👍 Positive: {sentiment['positive_pct']}%")
        print(f" 👎 Negative: {sentiment['negative_pct']}%")
        # NOTE(review): neutral is derived from the rounded percentages, so
        # it can be off by up to 0.1% from sentiment's own neutral count.
        print(f" 😐 Neutral: {100 - sentiment['positive_pct'] - sentiment['negative_pct']:.1f}%")

        print(f"\nTop words:")
        for word, count in sentiment['top_words'][:10]:
            # Skip short filler words (a, the, is, ...) in the display.
            if len(word) > 3:
                print(f" {word}: {count}")

    # Top posts
    print(f"\n🔥 Top Posts")
    print("-" * 30)
    for post in sorted(all_posts, key=lambda x: x['score'], reverse=True)[:5]:
        print(f"[{post['score']:4}] {post['title'][:50]}...")
        print(f" r/{post['subreddit']} | {post['num_comments']} comments")
        print()
|
||||||
|
|
||||||
|
def show_subreddit(subreddit: str, sort: str = "hot"):
    """Print a brief listing of posts from one subreddit.

    Args:
        subreddit: Subreddit name (without the ``r/`` prefix).
        sort: Listing to read — ``hot``, ``new``, or ``top``.
    """
    print(f"\n📌 r/{subreddit} ({sort})")
    print("=" * 50)

    entries = get_subreddit_posts(subreddit, sort, limit=15)

    # An empty list covers both fetch failures and nonexistent subreddits.
    if not entries:
        print("No posts found or subreddit doesn't exist")
        return

    for entry in entries:
        print(f"\n[{entry['score']:4}] {entry['title'][:60]}")
        print(f" {entry['num_comments']} comments | u/{entry['author']}")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: dispatch on the first positional argument."""
    args = sys.argv[1:]

    # No arguments: print usage and exit.
    if not args:
        print("Usage:")
        print(" reddit-scanner topic <query> [subreddits...]")
        print(" reddit-scanner sub <subreddit> [hot|new|top]")
        print(" reddit-scanner search <query>")
        print("")
        print("Examples:")
        print(" reddit-scanner topic 'artificial intelligence' technology futurology")
        print(" reddit-scanner sub python hot")
        print(" reddit-scanner search 'rust programming'")
        return

    cmd, rest = args[0], args[1:]

    if cmd == 'topic' and rest:
        # Optional trailing args are subreddit names.
        scan_topic(rest[0], rest[1:] if len(rest) > 1 else None)

    elif cmd == 'sub' and rest:
        show_subreddit(rest[0], rest[1] if len(rest) > 1 else 'hot')

    elif cmd == 'search' and rest:
        # All remaining args form the query.
        query = ' '.join(rest)
        posts = search_reddit(query, limit=15)
        print(f"\n🔍 Search: {query}")
        print("=" * 50)
        for post in posts:
            print(f"\n[{post['score']:4}] {post['title'][:60]}")
            print(f" r/{post['subreddit']} | {post['num_comments']} comments")

    else:
        print("Unknown command. Run without args for help.")


if __name__ == "__main__":
    main()
|
||||||
1
projects/reddit-scanner/requirements.txt
Normal file
1
projects/reddit-scanner/requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
# Add dependencies here
|
||||||
Reference in New Issue
Block a user