# update_sdgnews.py
import os
import re
import sys
import time
from datetime import datetime
from socket import timeout
from urllib.error import URLError
from urllib.request import Request, urlopen

import feedparser

from utils.news.utils import DatabaseManager, parse_date, setup_logging

logger = setup_logging()


class NewsUpdater:
    def __init__(self):
        """Initialize the NewsUpdater with a database connection."""
        logger.info("Initializing NewsUpdater...")
        self.db_manager = DatabaseManager()
        self.processed_titles = set()  # Track titles processed in the current session
        logger.info("NewsUpdater initialized successfully")

    def normalize_title(self, title):
        """Normalize a title for duplicate comparison."""
        if not title:
            return ""
        # Lowercase and collapse whitespace runs into single spaces
        title = ' '.join(title.lower().split())
        # Remove special characters, keeping letters, digits, underscores, and spaces
        title = re.sub(r'[^\w\s]', '', title)
        return title.strip()

    def get_rss_sources(self):
        """Read RSS sources from the configuration file."""
        logger.info("Starting to read RSS sources...")
        sources = []
        try:
            # Prefer news_sources.txt in the current directory, then utils/news/
            if os.path.exists("news_sources.txt"):
                file_path = "news_sources.txt"
                logger.info("Found news_sources.txt in current directory")
            elif os.path.exists("utils/news/news_sources.txt"):
                file_path = "utils/news/news_sources.txt"
                logger.info("Found news_sources.txt in utils/news directory")
            else:
                logger.error("news_sources.txt not found in either current directory or utils/news/")
                raise FileNotFoundError("news_sources.txt not found in current directory or utils/news/")

            logger.info(f"Reading sources from {file_path}")
            with open(file_path, "r", encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    # Skip blank lines and comments
                    if not line or line.startswith('#'):
                        continue
                    try:
                        url, category = [part.strip() for part in line.split(',', 1)]
                        if url and category:
                            sources.append((url, category))
                            logger.info(f"Added source: {url} (Category: {category})")
                        else:
                            logger.warning(f"Skipping invalid line (empty URL or category): {line}")
                    except ValueError:
                        logger.warning(f"Skipping invalid line (wrong format): {line}")
        except FileNotFoundError as e:
            logger.error(f"Error: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error reading RSS sources: {str(e)}")
            raise

        logger.info(f"Found {len(sources)} valid RSS sources")
        return sources

    def fetch_and_parse_feed(self, feed_url):
        """Fetch and parse an RSS feed, retrying on transient failures."""
        retries = 3
        timeout_seconds = 30
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        for attempt in range(retries):
            try:
                logger.info(f"Attempt {attempt + 1}/{retries} to fetch {feed_url}")
                request = Request(feed_url.strip(), headers=headers)
                # Close the connection as soon as the body has been read
                with urlopen(request, timeout=timeout_seconds) as response:
                    feed_content = response.read()
                logger.info(f"Successfully fetched content from {feed_url}")

                feed = feedparser.parse(feed_content)
                logger.info(f"Successfully parsed feed from {feed_url}")
                if feed.get('bozo_exception'):
                    logger.warning(f"Feed parsing warning for {feed_url}: {feed.get('bozo_exception')}")

                if 'entries' in feed and feed.entries:
                    logger.info(f"Found {len(feed.entries)} entries in feed: {feed_url}")
                    return feed.entries
                else:
                    logger.warning(f"No entries found in feed: {feed_url}")
                    return []
            except timeout:
                logger.error(f"Attempt {attempt + 1}/{retries}: The request timed out for {feed_url}")
            except URLError as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error connecting to URL {feed_url}: {str(e)}")
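            # Timeouts and connection errors are logged per attempt above;
            # any other failure (e.g. a decode or parse error) is caught by
            # the broad handler that follows.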
            except Exception as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error fetching or parsing feed {feed_url}: {str(e)}")
            if attempt < retries - 1:
                logger.info(f"Retrying... ({attempt + 1}/{retries})")
                time.sleep(5)

        logger.error(f"All retry attempts failed. Could not fetch the feed: {feed_url}")
        return []

    def is_duplicate(self, cursor, title):
        """Check whether a news item is a duplicate using normalized title comparison."""
        normalized_title = self.normalize_title(title)

        # First check the in-memory cache for this session
        if normalized_title in self.processed_titles:
            logger.info(f"Found duplicate in current session: '{title}'")
            return True

        # Then check the database, normalizing stored titles the same way
        # normalize_title() does in Python (raw string keeps the regex escapes intact)
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT id,
                       lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', '', 'g')) AS norm_title
                FROM news
            )
            SELECT EXISTS(
                SELECT 1 FROM normalized_titles WHERE norm_title = %s
            );
        """, (normalized_title,))
        exists = cursor.fetchone()[0]
        if exists:
            logger.info(f"Found duplicate in database: '{title}'")
            return True

        # Not a duplicate: remember the title for the rest of this session
        self.processed_titles.add(normalized_title)
        return False

    def process_news_item(self, news_item, category):
        """Process a single news item and insert it into the database."""
        conn = None
        cursor = None
        title = ""
        try:
            title = news_item.get("title", "").strip()
            if not title:
                logger.warning("Skipping news item with empty title")
                return

            url = news_item.get("link", "").strip()
            if not url:
                logger.warning(f"Skipping news item with empty url: {title}")
                return

            logger.info(f"Processing news item: {title}")
            description = news_item.get("description", "").strip()

            published_at = news_item.get("published", "") or news_item.get("updated", "")
            if published_at:
                published_at = parse_date(published_at)
            else:
                published_at = datetime.now()

            # Feedparser may expose the source as a dict-like object with a title
            source = news_item.get('source', 'Unknown')
            if isinstance(source, dict):
                source = source.get('title', 'Unknown')

            conn = self.db_manager.get_connection()
            cursor = conn.cursor()

            if self.is_duplicate(cursor, title):
                logger.info(f"Skipping duplicate news item: {title}")
                return

            cursor.execute("""
                INSERT INTO news (title, url, description, category, date, source)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (url) DO NOTHING
                RETURNING id;
            """, (title, url, description, category, published_at, source))
            result = cursor.fetchone()
            if result:
                news_id = result[0]
                conn.commit()
                logger.info(f"News item added (ID: {news_id}): {title}")
            else:
                # ON CONFLICT DO NOTHING returned no row; commit anyway so the
                # transaction does not stay open on the pooled connection
                conn.commit()
                logger.info(f"News item was not inserted (likely duplicate): {title}")
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Error processing news item '{title}': {str(e)}")
        finally:
            if cursor:
                cursor.close()
            if conn:
                self.db_manager.return_connection(conn)

    def update_news(self):
        """Main entry point: update news from all configured sources."""
        logger.info("Starting news update process...")
        try:
            # Ensure the table exists with the correct schema
            logger.info("Checking database schema...")
            self.db_manager.ensure_table_exists()
            logger.info("Database schema verified")

            # Get the RSS sources
            logger.info("Getting RSS sources...")
            sources = self.get_rss_sources()
            if not sources:
                logger.error("No RSS sources found. Please check news_sources.txt")
                return

            # Process each RSS source
            for url, category in sources:
                logger.info(f"Processing feed: {url} (Category: {category})")
                entries = self.fetch_and_parse_feed(url)
                for entry in entries:
                    self.process_news_item(entry, category)

            logger.info("News update completed successfully.")
        except Exception as e:
            logger.error(f"Error in update_news: {str(e)}")
            raise
        finally:
            logger.info("Closing database connections...")
            self.db_manager.close_all()
            logger.info("Database connections closed")


def main():
    logger.info("Starting script execution...")
    try:
        updater = NewsUpdater()
        updater.update_news()
    except KeyboardInterrupt:
        logger.info("Script interrupted by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Script execution failed: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
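
# Expected format of news_sources.txt, as parsed by get_rss_sources():
# one "url,category" pair per line; blank lines and lines starting with '#'
# are skipped. The URLs below are illustrative placeholders, not actual
# configured sources:
#
#   # SDG news feeds
#   https://example.org/sdg/rss.xml,Climate
#   https://example.org/education/feed,Education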