
Basic digest generation

Evan Pratten 2024-05-10 12:51:26 -04:00
parent 7d926df7e8
commit 28ff06bb15
2 changed files with 170 additions and 3 deletions


@@ -243,9 +243,6 @@
         "name": "Discord Engineering & Developers",
         "feeds": [
             "https://discord.com/blog/rss.xml"
         ],
-        "filters": [
-            "discord::category::engineering"
-        ]
     },
     {

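For reference, the new script below consumes subscription entries shaped like the one edited above. A minimal sketch of a subscriptions.json file, assuming the top level is a JSON array of subscription objects (the Discord entry is taken from the hunk above; the LWN entry and its feed URL are illustrative, using the one filter name the script actually checks for):

[
    {
        "name": "Discord Engineering & Developers",
        "feeds": [
            "https://discord.com/blog/rss.xml"
        ]
    },
    {
        "name": "LWN",
        "feeds": [
            "https://lwn.net/headlines/rss"
        ],
        "filters": [
            "lwn::hide-paid-articles"
        ]
    }
]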

@@ -0,0 +1,170 @@
#! /usr/bin/env python3
import argparse
import sys
import logging
import requests
import feedparser
import json
import sqlite3
from pathlib import Path

logger = logging.getLogger(__name__)


def get_all_articles_for_subscription(subscription: dict) -> list:
    """
    Get all articles for a given subscription
    """
    # Find all the feeds
    feeds = subscription.get("feeds", [])
    if len(feeds) != 1:
        logger.info(f"Found {len(feeds)} feeds for subscription {subscription['name']}")

    # Download all the feeds
    articles = []
    for feed_url in feeds:
        logger.info(f"Downloading feed {feed_url}")

        # Make a request. Careful to handle failures
        try:
            response = requests.get(feed_url, timeout=3.0)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.warning(f"Failed to download feed {feed_url}: {e}")
            continue

        # Parse the feed
        feed = feedparser.parse(response.text)
        logger.info(
            f"Found {len(feed.entries)} articles in feed {subscription['name']} ({feed.feed.title})"
        )

        # Add the articles to the list
        articles.extend(feed.entries)

    return articles


def main() -> int:
    # Handle program arguments
    ap = argparse.ArgumentParser(
        prog="ewp-generate-article-digest",
        description="Generates a digest of new articles",
    )
    ap.add_argument(
        "--subscriptions",
        help="Path to the subscriptions file",
        type=Path,
        default=Path(
            "~/.config/ewconfig/configs/ewp-rss-feeds/subscriptions.json"
        ).expanduser(),
    )
    ap.add_argument(
        "--cache-file",
        help="Path to the cache file",
        type=Path,
        default=Path("~/.cache/ewp-rss-feeds.sqlite3").expanduser(),
    )
    ap.add_argument(
        "-v", "--verbose", help="Enable verbose logging", action="store_true"
    )
    args = ap.parse_args()

    # Configure logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
    )

    # Load the subscriptions file
    if not args.subscriptions.exists():
        logger.error(f"Subscriptions file {args.subscriptions} does not exist")
        return 1
    subscriptions = json.loads(args.subscriptions.read_text())
    logger.info(f"Found {len(subscriptions)} subscriptions")

    # Set up the cache
    args.cache_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f"Using cache file {args.cache_file}")
    cache_db = sqlite3.connect(args.cache_file)
    cache_db.execute(
        """
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY,
            url TEXT,
            date_fetched TEXT
        )
        """
    )

    # Create an output buffer
    output = {}

    # Handle each subscription
    for subscription in subscriptions:
        logger.info(f"Processing subscription {subscription['name']}")
        articles = get_all_articles_for_subscription(subscription)

        # Process each article
        for article in articles:
            # If we need special pre-processing
            if "lwn::hide-paid-articles" in subscription.get("filters", []):
                if article.get("title", "").startswith("[$]"):
                    logger.info(f"Skipping paid article {article.title}")
                    continue

            # Determine the article URL
            url = article.get("link") or article.get("guid") or None
            if url is None:
                logger.warning(f"Skipping article with no URL: {article.get('title')}")
                continue

            # Check if the article is already in the cache
            cursor = cache_db.execute(
                "SELECT id FROM articles WHERE url = ?", (url,)
            )
            if cursor.fetchone() is not None:
                logger.debug(f"Skipping article {article.get('title')} (already in cache)")
                continue

            # Add the article to the output and cache it
            if subscription["name"] not in output:
                output[subscription["name"]] = []
            output[subscription["name"]].append({
                "title": article.get("title"),
                "link": url,
            })
            cache_db.execute(
                "INSERT INTO articles (url, date_fetched) VALUES (?, datetime('now'))",
                (url,),
            )

    # Sort the output by subscription name alphabetically (A first)
    output = dict(sorted(output.items(), key=lambda x: x[0].lower()))

    # Build the output
    output_str = ""
    for subscription, articles in output.items():
        logger.debug(f"Building output for {subscription} ({len(articles)} articles)")
        output_str += f">> {subscription}\n"
        for article in articles:
            output_str += f" - {article['title']}\n"
            output_str += f" URL: {article['link']}\n"
        output_str += "\n"

    # Print the output
    print(output_str)

    # Clean up
    cache_db.commit()
    cache_db.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())
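To try it out (assuming the script is saved as ewp-generate-article-digest, marked executable, and a subscriptions file exists at the default path):

./ewp-generate-article-digest --verbose

Each subscription then prints as a block in the format built above; the titles and links depend on the feeds:

>> Discord Engineering & Developers
 - Example article title
 URL: https://example.com/example-article

Fetched URLs are recorded in the articles table, so a second run only prints articles that appeared since the first. The cache can be inspected with the sqlite3 CLI:

sqlite3 ~/.cache/ewp-rss-feeds.sqlite3 "SELECT url, date_fetched FROM articles;"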