#!/usr/bin/env python3
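"""
Generates a digest of new articles from a set of RSS/Atom feed subscriptions.
Already-seen article URLs are tracked in a small SQLite cache so that each
article only ever appears in one digest.
"""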

import argparse
import json
import logging
import sqlite3
import sys
from pathlib import Path

import feedparser
import requests

logger = logging.getLogger(__name__)


def get_all_articles_for_subscription(subscription: dict) -> list:
    """
    Get all articles for a given subscription.
    """
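    # A sketch of the expected subscription shape, inferred from the keys this
    # function and main() read; the names and URLs here are placeholders:
    #   {
    #       "name": "Example feed",
    #       "feeds": ["https://example.com/feed.xml"],
    #       "filters": ["lwn::hide-paid-articles"],  # optional
    #   }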
    # Find all the feeds
    feeds = subscription.get("feeds", [])
    logger.info(f"Found {len(feeds)} feeds for subscription {subscription['name']}")

    # Download all the feeds
    articles = []
    for feed in feeds:
        logger.info(f"Downloading feed {feed}")

        # Make a request. Careful to handle failures
        try:
            response = requests.get(feed, timeout=3.0)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.warning(f"Failed to download feed {feed}: {e}")
            continue

        # Parse the feed (bound to a new name so the `feed` URL is not shadowed)
        parsed = feedparser.parse(response.text)
        logger.info(
            f"Found {len(parsed.entries)} articles in feed {subscription['name']}"
            f" ({parsed.feed.get('title', 'untitled')})"
        )

        # Add the articles to the list
        articles.extend(parsed.entries)

    return articles


def main() -> int:
    # Handle program arguments
    ap = argparse.ArgumentParser(
        prog="ewp-generate-article-digest",
        description="Generates a digest of new articles",
    )
    ap.add_argument(
        "--subscriptions",
        help="Path to the subscriptions file",
        type=Path,
        default=Path(
            "~/.config/ewconfig/configs/ewp-rss-feeds/subscriptions.json"
        ).expanduser(),
    )
    ap.add_argument(
        "--cache-file",
        help="Path to the cache file",
        type=Path,
        default=Path("~/.cache/ewp-rss-feeds.sqlite3").expanduser(),
    )
    ap.add_argument(
        "-v", "--verbose", help="Enable verbose logging", action="store_true"
    )
    args = ap.parse_args()

    # Configure logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
    )

    # Load the subscriptions file
    if not args.subscriptions.exists():
        logger.error(f"Subscriptions file {args.subscriptions} does not exist")
        return 1
    subscriptions = json.loads(args.subscriptions.read_text())
    logger.info(f"Found {len(subscriptions)} subscriptions")

    # Set up the cache
    args.cache_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f"Using cache file {args.cache_file}")
    cache_db = sqlite3.connect(args.cache_file)
    cache_db.execute(
        """
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY,
            url TEXT,
            date_fetched TEXT
        )
        """
    )
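
    # Deduplication is by exact article URL: once a URL lands in this table,
    # the article is never emitted in a digest again, even if its title changes.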

    # Create an output buffer
    output = {}

    # Handle each subscription
    for subscription in subscriptions:
        logger.info(f"Processing subscription {subscription['name']}")
        articles = get_all_articles_for_subscription(subscription)

        # Process each article
        for article in articles:

            # If we need special pre-processing
            if "lwn::hide-paid-articles" in subscription.get("filters", []):
                if article.get("title", "").startswith("[$]"):
                    logger.info(f"Skipping paid article {article.get('title')}")
                    continue

            # Determine the article URL
            url = article.get("link") or article.get("guid")
            if url is None:
                logger.warning(f"Skipping article with no URL: {article.get('title')}")
                continue

            # Check if the article is already in the cache
            cursor = cache_db.execute(
                "SELECT id FROM articles WHERE url = ?", (url,)
            )
            if cursor.fetchone() is not None:
                logger.debug(f"Skipping article {article.get('title')} (already in cache)")
                continue

            # Add the article to the output and cache it
            if subscription["name"] not in output:
                output[subscription["name"]] = []

            output[subscription["name"]].append({
                "title": article.get("title"),
                "link": url,
            })

            cache_db.execute(
                "INSERT INTO articles (url, date_fetched) VALUES (?, datetime('now'))",
                (url,),
            )

    # Sort the output by subscription name alphabetically (A first)
    output = dict(sorted(output.items(), key=lambda x: x[0].lower()))

    # Build the output
    output_str = ""
    for subscription, articles in output.items():
        logger.debug(f"Building output for {subscription} ({len(articles)} articles)")
        output_str += f">> {subscription}\n"
        for article in articles:
            output_str += f" - {article['title']}\n"
            output_str += f"   URL: {article['link']}\n"
        output_str += "\n"

    # Print the output
    print(output_str)

    # Clean up
    cache_db.commit()
    cache_db.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())