
Basic digest generation

Evan Pratten 2024-05-10 12:51:26 -04:00
parent 7d926df7e8
commit 28ff06bb15
2 changed files with 170 additions and 3 deletions


@@ -243,9 +243,6 @@
         "name": "Discord Engineering & Developers",
         "feeds": [
             "https://discord.com/blog/rss.xml"
         ],
-        "filters": [
-            "discord::category::engineering"
-        ]
     },
     {

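For reference, the new script below consumes subscription entries shaped like the one edited above. A minimal sketch of a subscriptions.json file, assuming the top level is a JSON array of subscription objects (the Discord entry is taken from the hunk above; the LWN entry and its feed URL are illustrative, using the one filter name the script actually checks for):

[
    {
        "name": "Discord Engineering & Developers",
        "feeds": [
            "https://discord.com/blog/rss.xml"
        ]
    },
    {
        "name": "LWN",
        "feeds": [
            "https://lwn.net/headlines/rss"
        ],
        "filters": [
            "lwn::hide-paid-articles"
        ]
    }
]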

@@ -0,0 +1,170 @@
#! /usr/bin/env python3
import argparse
import sys
import logging
import requests
import feedparser
import json
import sqlite3
from pathlib import Path

logger = logging.getLogger(__name__)


def get_all_articles_for_subscription(subscription: dict) -> list:
    """
    Get all articles for a given subscription
    """
    # Find all the feeds
    feeds = subscription.get("feeds", [])
    if len(feeds) != 1:
        logger.info(f"Found {len(feeds)} feeds for subscription {subscription['name']}")

    # Download all the feeds
    articles = []
    for feed_url in feeds:
        logger.info(f"Downloading feed {feed_url}")

        # Make a request. Careful to handle failures
        try:
            response = requests.get(feed_url, timeout=3.0)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.warning(f"Failed to download feed {feed_url}: {e}")
            continue

        # Parse the feed
        feed = feedparser.parse(response.text)
        logger.info(
            f"Found {len(feed.entries)} articles in feed {subscription['name']} ({feed.feed.title})"
        )

        # Add the articles to the list
        articles.extend(feed.entries)

    return articles


def main() -> int:
    # Handle program arguments
    ap = argparse.ArgumentParser(
        prog="ewp-generate-article-digest",
        description="Generates a digest of new articles",
    )
    ap.add_argument(
        "--subscriptions",
        help="Path to the subscriptions file",
        type=Path,
        default=Path(
            "~/.config/ewconfig/configs/ewp-rss-feeds/subscriptions.json"
        ).expanduser(),
    )
    ap.add_argument(
        "--cache-file",
        help="Path to the cache file",
        type=Path,
        default=Path("~/.cache/ewp-rss-feeds.sqlite3").expanduser(),
    )
    ap.add_argument(
        "-v", "--verbose", help="Enable verbose logging", action="store_true"
    )
    args = ap.parse_args()

    # Configure logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
    )

    # Load the subscriptions file
    if not args.subscriptions.exists():
        logger.error(f"Subscriptions file {args.subscriptions} does not exist")
        return 1
    subscriptions = json.loads(args.subscriptions.read_text())
    logger.info(f"Found {len(subscriptions)} subscriptions")

    # Set up the cache
    args.cache_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f"Using cache file {args.cache_file}")
    cache_db = sqlite3.connect(args.cache_file)
    cache_db.execute(
        """
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY,
            url TEXT,
            date_fetched TEXT
        )
        """
    )

    # Create an output buffer
    output = {}

    # Handle each subscription
    for subscription in subscriptions:
        logger.info(f"Processing subscription {subscription['name']}")
        articles = get_all_articles_for_subscription(subscription)

        # Process each article
        for article in articles:
            # If we need special pre-processing
            if "lwn::hide-paid-articles" in subscription.get("filters", []):
                if article.get("title", "").startswith("[$]"):
                    logger.info(f"Skipping paid article {article.title}")
                    continue

            # Determine the article URL
            url = article.get("link") or article.get("guid") or None
            if url is None:
                logger.warning(f"Skipping article with no URL: {article.get('title')}")
                continue

            # Check if the article is already in the cache
            cursor = cache_db.execute(
                "SELECT id FROM articles WHERE url = ?", (url,)
            )
            if cursor.fetchone() is not None:
                logger.debug(f"Skipping article {article.get('title')} (already in cache)")
                continue

            # Add the article to the output and cache it
            if subscription["name"] not in output:
                output[subscription["name"]] = []
            output[subscription["name"]].append({
                "title": article.get("title"),
                "link": url,
            })
            cache_db.execute(
                "INSERT INTO articles (url, date_fetched) VALUES (?, datetime('now'))",
                (url,),
            )

    # Sort the output by subscription name alphabetically (A first)
    output = dict(sorted(output.items(), key=lambda x: x[0].lower()))

    # Build the output
    output_str = ""
    for subscription, articles in output.items():
        logger.debug(f"Building output for {subscription} ({len(articles)} articles)")
        output_str += f">> {subscription}\n"
        for article in articles:
            output_str += f" - {article['title']}\n"
            output_str += f" URL: {article['link']}\n"
        output_str += "\n"

    # Print the output
    print(output_str)

    # Clean up
    cache_db.commit()
    cache_db.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())
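To try it out (assuming the script is saved as ewp-generate-article-digest, marked executable, and a subscriptions file exists at the default path):

./ewp-generate-article-digest --verbose

Each subscription then prints as a block in the format built above; the titles and links depend on the feeds:

>> Discord Engineering & Developers
 - Example article title
 URL: https://example.com/example-article

Fetched URLs are recorded in the articles table, so a second run only prints articles that appeared since the first. The cache can be inspected with the sqlite3 CLI:

sqlite3 ~/.cache/ewp-rss-feeds.sqlite3 "SELECT url, date_fetched FROM articles;"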