diff --git a/scripts/ewp-generate-article-digest b/scripts/ewp-generate-article-digest
deleted file mode 100755
index 8507098..0000000
--- a/scripts/ewp-generate-article-digest
+++ /dev/null
@@ -1,173 +0,0 @@
-#! /usr/bin/env python3
-import argparse
-import sys
-import logging
-import requests
-import feedparser
-import json
-import sqlite3
-import subprocess
-import smtplib
-from datetime import datetime
-from pathlib import Path
-
-logger = logging.getLogger(__name__)
-
-
-def get_all_articles_for_subscription(subscription: dict) -> list:
-    """
-    Get all articles for a given subscription
-    """
-
-    # Find all the feeds
-    feeds = subscription.get("feeds", [])
-    if feeds != 1:
-        logger.info(f"Found {len(feeds)} feeds for subscription {subscription['name']}")
-
-    # Download all the feeds
-    articles = []
-    for feed in feeds:
-        logger.info(f"Downloading feed {feed}")
-
-        # Make a request. Careful to handle failures
-        try:
-            response = requests.get(feed, timeout=3.0)
-            response.raise_for_status()
-        except requests.exceptions.RequestException as e:
-            logger.warning(f"Failed to download feed {feed}: {e}")
-            continue
-
-        # Parse the feed
-        feed = feedparser.parse(response.text)
-        logger.info(
-            f"Found {len(feed.entries)} articles in feed {subscription['name']} ({feed.feed.title})"
-        )
-
-        # Add the articles to the list
-        articles.extend(feed.entries)
-
-    return articles
-
-
-def main() -> int:
-    # Handle program arguments
-    ap = argparse.ArgumentParser(
-        prog="ewp-generate-article-digest",
-        description="Generates a digest of new articles",
-    )
-    ap.add_argument(
-        "--subscriptions",
-        help="Path to the subscriptions file",
-        type=Path,
-        default=Path(
-            "~/.config/ewconfig/configs/ewp-rss-feeds/subscriptions.json"
-        ).expanduser(),
-    )
-    ap.add_argument(
-        "--cache-file",
-        help="Path to the cache file",
-        type=Path,
-        default=Path("~/.cache/ewp-rss-feeds.sqlite3").expanduser(),
-    )
-    ap.add_argument(
-        "-v", "--verbose", help="Enable verbose logging", action="store_true"
-    )
-    args = ap.parse_args()
-
-    # Configure logging
-    logging.basicConfig(
-        level=logging.DEBUG if args.verbose else logging.INFO,
-        format="%(levelname)s: %(message)s",
-    )
-
-    # Load the subscriptions file
-    if not args.subscriptions.exists():
-        logger.error(f"Subscriptions file {args.subscriptions} does not exist")
-        return 1
-    subscriptions = json.loads(args.subscriptions.read_text())
-    logger.info(f"Found {len(subscriptions)} subscriptions")
-
-    # Set up the cache
-    args.cache_file.parent.mkdir(parents=True, exist_ok=True)
-    logger.info(f"Using cache file {args.cache_file}")
-    cache_db = sqlite3.connect(args.cache_file)
-    cache_db.execute(
-        """
-        CREATE TABLE IF NOT EXISTS articles (
-            id INTEGER PRIMARY KEY,
-            url TEXT,
-            date_fetched TEXT
-        )
-        """
-    )
-
-    # Create an output buffer
-    output = {}
-
-    # Handle each subscription
-    for subscription in subscriptions:
-        logger.info(f"Processing subscription {subscription['name']}")
-        articles = get_all_articles_for_subscription(subscription)
-
-        # Process each article
-        for article in articles:
-
-            # If we need special pre-processing
-            if "lwn::hide-paid-articles" in subscription.get("filters", []):
-                if article.get("title", "").startswith("[$]"):
-                    logger.info(f"Skipping paid article {article.title}")
-                    continue
-
-            # Determine the article URL
-            url = article.get("link") or article.get("guid") or None
-            if url is None:
-                logger.warning(f"Skipping article with no URL: {article.title}")
-                continue
-
-            # Check if the article is already in the cache
-            cursor = cache_db.execute(
-                "SELECT id FROM articles WHERE url = ?", (url,)
-            )
-            if cursor.fetchone() is not None:
-                logger.debug(f"Skipping article {article.title} (already in cache)")
-                continue
-
-            # Add the article to the output and cache it
-            if subscription['name'] not in output:
-                output[subscription['name']] = []
-
-            output[subscription['name']].append({
-                "title": article.get("title"),
-                "link": url,
-            })
-
-            cache_db.execute(
-                "INSERT INTO articles (url, date_fetched) VALUES (?, datetime('now'))",
-                (url,),
-            )
-
-    # Sort the output by subscription name alphabetically (A first)
-    output = dict(sorted(output.items(), key=lambda x: x[0].lower()))
-
-    # Build the output
-    output_str = ""
-    for subscription, articles in output.items():
-        logger.debug(f"Building output for {subscription} ({len(articles)} articles)")
-        output_str += f">> {subscription}\n"
-        for article in articles:
-            output_str += f" - {article['title']}\n"
-            output_str += f"   URL: {article['link']}\n"
-        output_str += "\n"
-
-    # Print the output
-    print(output_str)
-
-    # Clean up
-    cache_db.commit()
-    cache_db.close()
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())