# update_sdgnews.py
import os
import time
import re
import sys
from datetime import datetime
import feedparser
from urllib.request import Request, urlopen
from urllib.error import URLError
from socket import timeout
from utils.news.utils import DatabaseManager, parse_date, setup_logging
logger = setup_logging()


class NewsUpdater:
    def __init__(self):
        """Initialize the NewsUpdater with a database connection"""
        logger.info("Initializing NewsUpdater...")
        self.db_manager = DatabaseManager()
        self.processed_titles = set()  # Track processed titles in current session
        logger.info("NewsUpdater initialized successfully")

    def normalize_title(self, title):
        """Normalize title for comparison"""
        if not title:
            return ""
        # Convert to lowercase and normalize whitespace
        title = ' '.join(title.lower().split())
        # Remove special characters except letters, numbers, and spaces
        title = re.sub(r'[^\w\s]', '', title)
        return title.strip()
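
    # Illustrative example of the normalization above:
    #   normalize_title("SDG-7: Clean Energy!") -> "sdg7 clean energy"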

    def get_rss_sources(self):
        """Read RSS sources from the configuration file"""
        logger.info("Starting to read RSS sources...")
        sources = []
        try:
            # First try the file in the current directory
            if os.path.exists("news_sources.txt"):
                file_path = "news_sources.txt"
                logger.info("Found news_sources.txt in current directory")
            # Then try the utils/news directory
            elif os.path.exists("utils/news/news_sources.txt"):
                file_path = "utils/news/news_sources.txt"
                logger.info("Found news_sources.txt in utils/news directory")
            else:
                logger.error("news_sources.txt not found in either current directory or utils/news/")
                raise FileNotFoundError("news_sources.txt not found in current directory or utils/news/")
            logger.info(f"Reading sources from {file_path}")
            with open(file_path, "r", encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue
                    try:
                        url, category = [part.strip() for part in line.split(',', 1)]
                        if url and category:
                            sources.append((url, category))
                            logger.info(f"Added source: {url} (Category: {category})")
                        else:
                            logger.warning(f"Skipping invalid line (empty URL or category): {line}")
                    except ValueError:
                        logger.warning(f"Skipping invalid line (wrong format): {line}")
        except FileNotFoundError as e:
            logger.error(f"Error: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error reading RSS sources: {str(e)}")
            raise
        logger.info(f"Found {len(sources)} valid RSS sources")
        return sources
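
    # Expected news_sources.txt format, based on the parsing above: one
    # "feed_url, category" pair per line, with '#' lines treated as comments.
    # The URL below is purely illustrative:
    #   https://example.org/rss/sdg-news.xml, Sustainability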

    def fetch_and_parse_feed(self, feed_url):
        """Fetch and parse an RSS feed with retries"""
        retries = 3
        timeout_seconds = 30
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        for attempt in range(retries):
            try:
                logger.info(f"Attempt {attempt + 1}/{retries} to fetch {feed_url}")
                request = Request(feed_url.strip(), headers=headers)
                # Close the connection as soon as the body has been read
                with urlopen(request, timeout=timeout_seconds) as response:
                    feed_content = response.read()
                logger.info(f"Successfully fetched content from {feed_url}")
                feed = feedparser.parse(feed_content)
                logger.info(f"Successfully parsed feed from {feed_url}")
                if feed.get('bozo_exception'):
                    logger.warning(f"Feed parsing warning for {feed_url}: {feed.get('bozo_exception')}")
                if 'entries' in feed and feed.entries:
                    logger.info(f"Found {len(feed.entries)} entries in feed: {feed_url}")
                    return feed.entries
                else:
                    logger.warning(f"No entries found in feed: {feed_url}")
                    return []
            except timeout:
                logger.error(f"Attempt {attempt + 1}/{retries}: The request timed out for {feed_url}")
            except URLError as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error connecting to URL {feed_url}: {str(e)}")
            except Exception as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error fetching or parsing feed {feed_url}: {str(e)}")
            if attempt < retries - 1:
                logger.info(f"Retrying... ({attempt + 1}/{retries})")
                time.sleep(5)
        logger.error(f"All retry attempts failed. Could not fetch the feed: {feed_url}")
        return []

    def is_duplicate(self, cursor, title):
        """Check if a news item is a duplicate using normalized title comparison"""
        normalized_title = self.normalize_title(title)
        # First check in-memory cache
        if normalized_title in self.processed_titles:
            logger.info(f"Found duplicate in current session: '{title}'")
            return True
        # Then check database (raw string so '\s' and '\w' reach the SQL regexes unescaped)
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT id,
                       lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', '', 'g')) AS norm_title
                FROM news
            )
            SELECT EXISTS(
                SELECT 1 FROM normalized_titles
                WHERE norm_title = %s
            );
        """, (normalized_title,))
        exists = cursor.fetchone()[0]
        if exists:
            logger.info(f"Found duplicate in database: '{title}'")
            return True
        # Add to processed titles if not a duplicate
        self.processed_titles.add(normalized_title)
        return False
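
    # The query above normalizes stored titles with PostgreSQL's regexp_replace so the
    # comparison mirrors normalize_title(); the %s placeholders assume a DB-API driver
    # that uses format-style parameters (e.g. psycopg2 for PostgreSQL).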

    def process_news_item(self, news_item, category):
        """Process a single news item and insert it into the database"""
        conn = None
        cursor = None
        title = ""  # Defined up front so the error handler below can reference it
        try:
            title = news_item.get("title", "").strip()
            if not title:
                logger.warning("Skipping news item with empty title")
                return
            url = news_item.get("link", "").strip()
            if not url:
                logger.warning(f"Skipping news item with empty url: {title}")
                return
            logger.info(f"Processing news item: {title}")
            description = news_item.get("description", "").strip()
            published_at = news_item.get("published", "") or news_item.get("updated", "")
            if published_at:
                published_at = parse_date(published_at)
            else:
                published_at = datetime.now()
            source = news_item.get('source', {}).get('title', 'Unknown')
            if isinstance(source, dict):
                source = source.get('title', 'Unknown')
            conn = self.db_manager.get_connection()
            cursor = conn.cursor()
            if self.is_duplicate(cursor, title):
                logger.info(f"Skipping duplicate news item: {title}")
                return
            cursor.execute("""
                INSERT INTO news (title, url, description, category, date, source)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (url) DO NOTHING
                RETURNING id;
            """, (title, url, description, category, published_at, source))
            result = cursor.fetchone()
            if result:
                news_id = result[0]
                conn.commit()
                logger.info(f"News item added (ID: {news_id}): {title}")
            else:
                # Commit anyway so the connection is not returned to the pool mid-transaction
                conn.commit()
                logger.info(f"News item was not inserted (likely duplicate): {title}")
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Error processing news item '{title}': {str(e)}")
        finally:
            if cursor:
                cursor.close()
            if conn:
                self.db_manager.return_connection(conn)
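
    # The INSERT above assumes the news table created by DatabaseManager.ensure_table_exists()
    # has columns (title, url, description, category, date, source) and a unique
    # constraint on url, which is what ON CONFLICT (url) relies on.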

    def update_news(self):
        """Main function to update news from all sources"""
        logger.info("Starting news update process...")
        try:
            # Ensure table exists with correct schema
            logger.info("Checking database schema...")
            self.db_manager.ensure_table_exists()
            logger.info("Database schema verified")
            # Get RSS sources
            logger.info("Getting RSS sources...")
            sources = self.get_rss_sources()
            if not sources:
                logger.error("No RSS sources found. Please check news_sources.txt")
                return
            # Process each RSS source
            for url, category in sources:
                logger.info(f"Processing feed: {url} (Category: {category})")
                entries = self.fetch_and_parse_feed(url)
                for entry in entries:
                    self.process_news_item(entry, category)
            logger.info("News update completed successfully.")
        except Exception as e:
            logger.error(f"Error in update_news: {str(e)}")
            raise
        finally:
            logger.info("Closing database connections...")
            self.db_manager.close_all()
            logger.info("Database connections closed")


def main():
    logger.info("Starting script execution...")
    try:
        updater = NewsUpdater()
        updater.update_news()
    except KeyboardInterrupt:
        logger.info("Script interrupted by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Script execution failed: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()