Add news-feed project: an RSS/Atom feed reader that supports multiple feeds (HN, Lobsters, etc.), caches articles, tracks read/unread status, and generates digests. Works out of the box.

This commit is contained in:
2026-01-30 23:20:21 -06:00
parent b579251e90
commit 25c64a2b8e
5 changed files with 768 additions and 0 deletions

301
projects/news-feed/main.py Executable file
View File

@ -0,0 +1,301 @@
#!/usr/bin/env python3
"""
news-feed - RSS reader and news aggregator for staying informed
A simple RSS/Atom feed reader that:
- Fetches and parses feeds
- Stores articles locally
- Generates digests
- Tracks read/unread status
"""
import hashlib
import json
import os
import sys
import xml.etree.ElementTree as ET
from datetime import datetime
from html.parser import HTMLParser
from pathlib import Path
from typing import Optional
from urllib.error import URLError
from urllib.request import Request, urlopen
# On-disk state lives in a data/ directory next to this script.
PROJECT_DIR = Path(__file__).parent
DATA_DIR = PROJECT_DIR / "data"
# Configured feed list: JSON array of {name, url, category} objects.
FEEDS_FILE = DATA_DIR / "feeds.json"
# Article cache: JSON object keyed by 12-char article id.
ARTICLES_FILE = DATA_DIR / "articles.json"
# Default feeds to get started
DEFAULT_FEEDS = [
    {"name": "Hacker News", "url": "https://hnrss.org/frontpage", "category": "tech"},
    {"name": "Lobsters", "url": "https://lobste.rs/rss", "category": "tech"},
    {"name": "r/programming", "url": "https://www.reddit.com/r/programming/.rss", "category": "tech"},
]
class MLStripper(HTMLParser):
    """HTMLParser subclass that keeps only text content, dropping all markup."""

    def __init__(self):
        super().__init__()
        self._parts = []  # text fragments, in document order

    def handle_data(self, data):
        # HTMLParser calls this for each run of character data between tags.
        self._parts.append(data)

    def get_text(self):
        """Return the concatenated text collected so far."""
        return ''.join(self._parts)
def strip_html(html: str) -> str:
    """Remove HTML tags from *html* and return only the text content.

    Falls back to returning the input unchanged if parsing fails.
    """
    stripper = MLStripper()
    try:
        stripper.feed(html)
        return stripper.get_text()
    except Exception:
        # Malformed markup: return the raw string rather than crash.
        # (Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt — narrowed to Exception.)
        return html
def load_feeds() -> list:
    """Return the configured feed list, falling back to the built-in defaults."""
    if not FEEDS_FILE.exists():
        return DEFAULT_FEEDS
    with open(FEEDS_FILE) as fh:
        return json.load(fh)
def save_feeds(feeds: list):
    """Persist the feed list as pretty-printed JSON, creating data/ if needed."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    FEEDS_FILE.write_text(json.dumps(feeds, indent=2))
def load_articles() -> dict:
    """Return the cached article index keyed by article id ({} if no cache)."""
    if ARTICLES_FILE.exists():
        return json.loads(ARTICLES_FILE.read_text())
    return {}
def save_articles(articles: dict):
    """Write the article cache to disk as JSON, creating data/ if needed."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    ARTICLES_FILE.write_text(json.dumps(articles, indent=2))
def fetch_feed(url: str) -> Optional[str]:
    """Fetch the raw feed document at *url*.

    Returns the decoded body, or None if the request fails.  Catches
    OSError so that connection failures (URLError) *and* socket timeouts
    (TimeoutError, which is NOT a URLError and previously escaped the
    handler) are both treated as fetch failures.
    """
    # Some feeds reject the default urllib User-Agent, so send our own.
    req = Request(url, headers={'User-Agent': 'news-feed/1.0'})
    try:
        with urlopen(req, timeout=10) as resp:
            # Feeds occasionally lie about their encoding; drop bad bytes.
            return resp.read().decode('utf-8', errors='ignore')
    except OSError as e:  # URLError and TimeoutError are both OSError
        print(f" Error fetching: {e}")
        return None
def parse_feed(content: str, feed_name: str) -> list:
    """Parse an RSS or Atom document into a list of article dicts.

    RSS <item> elements are tried first; if none yield an article, the
    document is re-scanned for Atom <entry> elements.  Malformed XML
    produces an empty list.
    """

    def _make(title, link, body, published):
        # Normalize one article; the id is a short md5 of the raw link so
        # the same URL always maps to the same cache key.
        return {
            'id': hashlib.md5(link.encode()).hexdigest()[:12],
            'title': title.strip(),
            'link': link.strip(),
            'description': strip_html(body)[:300] if body else '',
            'published': published,
            'feed': feed_name,
            'fetched': datetime.now().isoformat(),
            'read': False,
        }

    try:
        root = ET.fromstring(content)
    except ET.ParseError as e:
        print(f" Parse error: {e}")
        return []

    found = []
    # RSS 2.0: <item> elements anywhere under the root.
    for item in root.findall('.//item'):
        title = item.findtext('title', '')
        link = item.findtext('link', '')
        if title and link:
            found.append(_make(title, link,
                               item.findtext('description', ''),
                               item.findtext('pubDate', '')))
    if found:
        return found

    # Atom: namespaced <entry> elements (with an un-namespaced fallback).
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    for entry in root.findall('.//atom:entry', ns) or root.findall('.//entry'):
        title = entry.findtext('atom:title', '', ns) or entry.findtext('title', '')
        link_elem = entry.find('atom:link', ns) or entry.find('link')
        link = link_elem.get('href', '') if link_elem is not None else ''
        summary = entry.findtext('atom:summary', '', ns) or entry.findtext('summary', '')
        if title and link:
            found.append(_make(title, link, summary, ''))
    return found
def refresh():
    """Fetch every configured feed and merge unseen articles into the cache."""
    feeds = load_feeds()
    cache = load_articles()
    added = 0
    print(f"Refreshing {len(feeds)} feeds...\n")
    for feed in feeds:
        print(f" {feed['name']}...", end=' ', flush=True)
        body = fetch_feed(feed['url'])
        if not body:
            print("failed")
            continue
        parsed = parse_feed(body, feed['name'])
        # Only articles whose id is not already cached count as new.
        for art in parsed:
            if art['id'] not in cache:
                cache[art['id']] = art
                added += 1
        print(f"{len(parsed)} items")
    save_articles(cache)
    print(f"\n{added} new articles")
def list_articles(limit: int = 20, unread_only: bool = False, feed: str = None):
    """Print up to *limit* cached articles, most recently fetched first.

    unread_only restricts output to articles not yet marked read; *feed*
    is a case-insensitive substring filter on the feed name.
    """
    pool = list(load_articles().values())
    if unread_only:
        pool = [a for a in pool if not a.get('read')]
    if feed:
        needle = feed.lower()
        pool = [a for a in pool if needle in a['feed'].lower()]
    # Newest first, by the timestamp recorded when the article was fetched.
    pool.sort(key=lambda a: a.get('fetched', ''), reverse=True)
    pool = pool[:limit]
    if not pool:
        print("No articles found")
        return
    print(f"\n📰 Articles ({len(pool)} shown)\n")
    for art in pool:
        marker = " " if art.get('read') else "🔵"
        headline = art['title']
        if len(headline) > 60:
            headline = headline[:60] + "..."
        print(f"{marker} [{art['id']}] {headline}")
        print(f" {art['feed']} | {art['link'][:50]}...")
        print()
def read_article(article_id: str):
    """Show an article matched by id prefix and mark it as read."""
    cache = load_articles()
    # Prefix matching lets users type just the first few id characters;
    # with multiple matches the first one wins.
    hits = [a for aid, a in cache.items() if aid.startswith(article_id)]
    if not hits:
        print(f"Article not found: {article_id}")
        return
    art = hits[0]
    art['read'] = True
    cache[art['id']] = art
    save_articles(cache)
    print(f"\n📄 {art['title']}")
    print(f" Feed: {art['feed']}")
    print(f" Link: {art['link']}")
    print()
    if art.get('description'):
        print(f" {art['description']}")
        print()
def add_feed(url: str, name: str = None, category: str = "general"):
    """Register a new feed unless its URL is already configured."""
    feeds = load_feeds()
    known_urls = {f['url'] for f in feeds}
    if url in known_urls:
        print("Feed already exists")
        return
    label = name or url  # fall back to the URL when no name is given
    feeds.append({
        'name': label,
        'url': url,
        'category': category,
    })
    save_feeds(feeds)
    print(f"✓ Added: {label}")
def list_feeds():
    """Print every configured feed with its category and URL."""
    feeds = load_feeds()
    print(f"\n📡 Feeds ({len(feeds)})\n")
    for entry in feeds:
        cat = entry.get('category', 'general')
        print(f" [{cat}] {entry['name']}")
        print(f" {entry['url']}")
        print()
def digest():
    """Print a per-feed summary of unread articles (up to 3 titles each)."""
    unread = [a for a in load_articles().values() if not a.get('read')]
    # Group unread articles by their source feed.
    grouped = {}
    for art in unread:
        grouped.setdefault(art['feed'], []).append(art)
    print(f"\n📰 News Digest - {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f" {len(unread)} unread articles\n")
    for feed_name, arts in sorted(grouped.items()):
        print(f"📌 {feed_name} ({len(arts)})")
        for art in arts[:3]:
            headline = art['title']
            if len(headline) > 50:
                headline = headline[:50] + "..."
            print(f"{headline}")
        if len(arts) > 3:
            print(f" ... and {len(arts) - 3} more")
        print()
def main():
    """CLI entry point: dispatch on the first command-line argument."""
    args = sys.argv[1:]
    if not args:
        # No command given: show usage and exit.
        print("Usage:")
        print(" news-feed refresh - Fetch new articles")
        print(" news-feed list [--unread] - List articles")
        print(" news-feed read <id> - Read an article")
        print(" news-feed digest - Quick digest")
        print(" news-feed feeds - List feeds")
        print(" news-feed add <url> [name] - Add a feed")
        return
    cmd = args[0]
    if cmd == 'refresh':
        refresh()
    elif cmd == 'list':
        list_articles(unread_only='--unread' in args)
    elif cmd == 'read' and len(args) > 1:
        read_article(args[1])
    elif cmd == 'digest':
        digest()
    elif cmd == 'feeds':
        list_feeds()
    elif cmd == 'add' and len(args) > 1:
        # Optional third argument is a human-friendly feed name.
        add_feed(args[1], args[2] if len(args) > 2 else None)
    else:
        print("Unknown command")


if __name__ == "__main__":
    main()