# update_sdgnews.py
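"""Fetch SDG-related news from a list of RSS feeds and store new items.

Reads feed URLs and categories from news_sources.txt, fetches and parses each
feed with feedparser, skips duplicates (by normalized title and by URL), and
inserts the rest into the ``news`` table through the project's DatabaseManager
connection pool. The ``%s`` placeholder style, ``regexp_replace``, and
``ON CONFLICT`` clause below assume a PostgreSQL backend.
"""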

import os
import re
import sys
import time
from datetime import datetime
from socket import timeout
from urllib.error import URLError
from urllib.request import Request, urlopen

import feedparser

from utils.news.utils import DatabaseManager, parse_date, setup_logging

logger = setup_logging()


class NewsUpdater:
    def __init__(self):
        """Initialize the NewsUpdater with a database connection"""
        logger.info("Initializing NewsUpdater...")
        self.db_manager = DatabaseManager()
        self.processed_titles = set()  # Track processed titles in current session
        logger.info("NewsUpdater initialized successfully")

    def normalize_title(self, title):
        """Normalize a title for duplicate comparison"""
        if not title:
            return ""
        # Convert to lowercase and collapse runs of whitespace
        title = ' '.join(title.lower().split())
        # Remove special characters except letters, numbers, and spaces
        title = re.sub(r'[^\w\s]', '', title)
        return title.strip()

    def get_rss_sources(self):
        """Read RSS sources from the configuration file"""
        logger.info("Starting to read RSS sources...")
        sources = []
        try:
            # First try the file in the current directory
            if os.path.exists("news_sources.txt"):
                file_path = "news_sources.txt"
                logger.info("Found news_sources.txt in current directory")
            # Then try the utils/news directory
            elif os.path.exists("utils/news/news_sources.txt"):
                file_path = "utils/news/news_sources.txt"
                logger.info("Found news_sources.txt in utils/news directory")
            else:
                logger.error("news_sources.txt not found in either current directory or utils/news/")
                raise FileNotFoundError("news_sources.txt not found in current directory or utils/news/")

            logger.info(f"Reading sources from {file_path}")
            with open(file_path, "r", encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue

                    try:
                        url, category = [part.strip() for part in line.split(',', 1)]
                        if url and category:
                            sources.append((url, category))
                            logger.info(f"Added source: {url} (Category: {category})")
                        else:
                            logger.warning(f"Skipping invalid line (empty URL or category): {line}")
                    except ValueError:
                        logger.warning(f"Skipping invalid line (wrong format): {line}")

        except FileNotFoundError as e:
            logger.error(f"Error: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error reading RSS sources: {str(e)}")
            raise

        logger.info(f"Found {len(sources)} valid RSS sources")
        return sources
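
    # A minimal sketch of the news_sources.txt format expected by the parser
    # above: one "<feed-url>,<category>" pair per line, with blank lines and
    # lines starting with '#' ignored. The URLs below are illustrative
    # placeholders, not real feeds:
    #
    #   # SDG news feeds
    #   https://example.org/feeds/sdg-news.rss,SDG
    #   https://example.com/rss/climate.xml,Climate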

    def fetch_and_parse_feed(self, feed_url):
        """Fetch and parse an RSS feed with retries"""
        retries = 3
        timeout_seconds = 30

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }

        for attempt in range(retries):
            try:
                logger.info(f"Attempt {attempt + 1}/{retries} to fetch {feed_url}")
                request = Request(feed_url.strip(), headers=headers)
                # Context manager ensures the socket is closed even on errors
                with urlopen(request, timeout=timeout_seconds) as response:
                    feed_content = response.read()
                logger.info(f"Successfully fetched content from {feed_url}")

                feed = feedparser.parse(feed_content)
                logger.info(f"Successfully parsed feed from {feed_url}")

                if feed.get('bozo_exception'):
                    logger.warning(f"Feed parsing warning for {feed_url}: {feed.get('bozo_exception')}")

                if feed.entries:
                    logger.info(f"Found {len(feed.entries)} entries in feed: {feed_url}")
                    return feed.entries
                else:
                    logger.warning(f"No entries found in feed: {feed_url}")
                    return []

            except timeout:
                logger.error(f"Attempt {attempt + 1}/{retries}: The request timed out for {feed_url}")
            except URLError as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error connecting to URL {feed_url}: {str(e)}")
            except Exception as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error fetching or parsing feed {feed_url}: {str(e)}")

            if attempt < retries - 1:
                logger.info(f"Retrying... ({attempt + 1}/{retries})")
                time.sleep(5)

        logger.error(f"All retry attempts failed. Could not fetch the feed: {feed_url}")
        return []

    def is_duplicate(self, cursor, title):
        """Check if a news item is a duplicate using normalized title comparison"""
        normalized_title = self.normalize_title(title)

        # First check the in-memory cache for this session
        if normalized_title in self.processed_titles:
            logger.info(f"Found duplicate in current session: '{title}'")
            return True

        # Then check the database, normalizing stored titles the same way
        # normalize_title does: collapse whitespace, strip punctuation,
        # lowercase, trim. The raw string keeps \s and \w intact for PostgreSQL.
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT id,
                       btrim(lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', '', 'g'))) AS norm_title
                FROM news
            )
            SELECT EXISTS(
                SELECT 1 FROM normalized_titles
                WHERE norm_title = %s
            );
        """, (normalized_title,))

        exists = cursor.fetchone()[0]
        if exists:
            logger.info(f"Found duplicate in database: '{title}'")
            return True

        # Remember this title so repeats within the same run are caught cheaply
        self.processed_titles.add(normalized_title)
        return False

    def process_news_item(self, news_item, category):
        """Process a single news item and insert it into the database"""
        conn = None
        cursor = None
        title = ""  # defined up front so the except handler can always log it
        try:
            title = news_item.get("title", "").strip()
            if not title:
                logger.warning("Skipping news item with empty title")
                return

            url = news_item.get("link", "").strip()
            if not url:
                logger.warning(f"Skipping news item with empty url: {title}")
                return

            logger.info(f"Processing news item: {title}")
            description = news_item.get("description", "").strip()
            published_at = news_item.get("published", "") or news_item.get("updated", "")

            if published_at:
                published_at = parse_date(published_at)
            else:
                published_at = datetime.now()

            # feedparser exposes the source as a dict-like object with a
            # 'title' key, or may omit it entirely
            source = news_item.get('source', 'Unknown')
            if isinstance(source, dict):
                source = source.get('title', 'Unknown')

            conn = self.db_manager.get_connection()
            cursor = conn.cursor()

            if self.is_duplicate(cursor, title):
                logger.info(f"Skipping duplicate news item: {title}")
                conn.rollback()  # release the read transaction before pooling the connection
                return

            cursor.execute("""
                INSERT INTO news (title, url, description, category, date, source)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (url) DO NOTHING
                RETURNING id;
            """, (title, url, description, category, published_at, source))

            result = cursor.fetchone()
            conn.commit()  # commit even when ON CONFLICT skipped the insert
            if result:
                logger.info(f"News item added (ID: {result[0]}): {title}")
            else:
                logger.info(f"News item was not inserted (likely duplicate): {title}")

        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Error processing news item '{title}': {str(e)}")
        finally:
            if cursor:
                cursor.close()
            if conn:
                self.db_manager.return_connection(conn)
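
    # For reference: the INSERT above assumes a ``news`` table with a unique
    # constraint on ``url``, presumably created by
    # DatabaseManager.ensure_table_exists. A hypothetical minimal schema, not
    # necessarily the project's actual DDL, might look like:
    #
    #   CREATE TABLE IF NOT EXISTS news (
    #       id          SERIAL PRIMARY KEY,
    #       title       TEXT NOT NULL,
    #       url         TEXT NOT NULL UNIQUE,
    #       description TEXT,
    #       category    TEXT,
    #       date        TIMESTAMP,
    #       source      TEXT
    #   );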

    def update_news(self):
        """Main function to update news from all sources"""
        logger.info("Starting news update process...")
        try:
            # Ensure the table exists with the correct schema
            logger.info("Checking database schema...")
            self.db_manager.ensure_table_exists()
            logger.info("Database schema verified")

            # Get RSS sources
            logger.info("Getting RSS sources...")
            sources = self.get_rss_sources()

            if not sources:
                logger.error("No RSS sources found. Please check news_sources.txt")
                return

            # Process each RSS source
            for url, category in sources:
                logger.info(f"Processing feed: {url} (Category: {category})")
                entries = self.fetch_and_parse_feed(url)

                for entry in entries:
                    self.process_news_item(entry, category)

            logger.info("News update completed successfully.")

        except Exception as e:
            logger.error(f"Error in update_news: {str(e)}")
            raise
        finally:
            logger.info("Closing database connections...")
            self.db_manager.close_all()
            logger.info("Database connections closed")


def main():
    logger.info("Starting script execution...")
    try:
        updater = NewsUpdater()
        updater.update_news()
    except KeyboardInterrupt:
        logger.info("Script interrupted by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Script execution failed: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()