# update_sdgnews.py
import os
import time
import re
import sys
from datetime import datetime
import feedparser
from urllib.request import Request, urlopen
from urllib.error import URLError
from socket import timeout
from utils.news.utils import DatabaseManager, parse_date, setup_logging
logger = setup_logging()


class NewsUpdater:
    def __init__(self):
        """Initialize the NewsUpdater with a database connection"""
        logger.info("Initializing NewsUpdater...")
        self.db_manager = DatabaseManager()
        self.processed_titles = set()  # Track processed titles in current session
        logger.info("NewsUpdater initialized successfully")

    def normalize_title(self, title):
        """Normalize title for comparison"""
        if not title:
            return ""
        # Convert to lowercase and normalize whitespace
        title = ' '.join(title.lower().split())
        # Remove special characters except letters, numbers, and spaces
        title = re.sub(r'[^\w\s]', '', title)
        return title.strip()
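
    # Illustrative example of the normalization above:
    #   normalize_title("SDG-7: Clean Energy!") -> "sdg7 clean energy"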

    def get_rss_sources(self):
        """Read RSS sources from the configuration file"""
        logger.info("Starting to read RSS sources...")
        sources = []
        try:
            # First try the file in the current directory
            if os.path.exists("news_sources.txt"):
                file_path = "news_sources.txt"
                logger.info("Found news_sources.txt in current directory")
            # Then try the utils/news directory
            elif os.path.exists("utils/news/news_sources.txt"):
                file_path = "utils/news/news_sources.txt"
                logger.info("Found news_sources.txt in utils/news directory")
            else:
                logger.error("news_sources.txt not found in either current directory or utils/news/")
                raise FileNotFoundError("news_sources.txt not found in current directory or utils/news/")
            logger.info(f"Reading sources from {file_path}")
            with open(file_path, "r", encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue
                    try:
                        url, category = [part.strip() for part in line.split(',', 1)]
                        if url and category:
                            sources.append((url, category))
                            logger.info(f"Added source: {url} (Category: {category})")
                        else:
                            logger.warning(f"Skipping invalid line (empty URL or category): {line}")
                    except ValueError:
                        logger.warning(f"Skipping invalid line (wrong format): {line}")
        except FileNotFoundError as e:
            logger.error(f"Error: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error reading RSS sources: {str(e)}")
            raise
        logger.info(f"Found {len(sources)} valid RSS sources")
        return sources
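
    # Expected news_sources.txt format, based on the parsing above: one
    # "feed_url, category" pair per line, with '#' lines treated as comments.
    # The URL below is purely illustrative:
    #   https://example.org/rss/sdg-news.xml, Sustainability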

    def fetch_and_parse_feed(self, feed_url):
        """Fetch and parse an RSS feed with retries"""
        retries = 3
        timeout_seconds = 30
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        for attempt in range(retries):
            try:
                logger.info(f"Attempt {attempt + 1}/{retries} to fetch {feed_url}")
                request = Request(feed_url.strip(), headers=headers)
                # Close the connection as soon as the body has been read
                with urlopen(request, timeout=timeout_seconds) as response:
                    feed_content = response.read()
                logger.info(f"Successfully fetched content from {feed_url}")
                feed = feedparser.parse(feed_content)
                logger.info(f"Successfully parsed feed from {feed_url}")
                if feed.get('bozo_exception'):
                    logger.warning(f"Feed parsing warning for {feed_url}: {feed.get('bozo_exception')}")
                if 'entries' in feed and feed.entries:
                    logger.info(f"Found {len(feed.entries)} entries in feed: {feed_url}")
                    return feed.entries
                else:
                    logger.warning(f"No entries found in feed: {feed_url}")
                    return []
            except timeout:
                logger.error(f"Attempt {attempt + 1}/{retries}: The request timed out for {feed_url}")
            except URLError as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error connecting to URL {feed_url}: {str(e)}")
            except Exception as e:
                logger.error(f"Attempt {attempt + 1}/{retries}: Error fetching or parsing feed {feed_url}: {str(e)}")
            if attempt < retries - 1:
                logger.info(f"Retrying... ({attempt + 1}/{retries})")
                time.sleep(5)
        logger.error(f"All retry attempts failed. Could not fetch the feed: {feed_url}")
        return []

    def is_duplicate(self, cursor, title):
        """Check if a news item is a duplicate using normalized title comparison"""
        normalized_title = self.normalize_title(title)
        # First check in-memory cache
        if normalized_title in self.processed_titles:
            logger.info(f"Found duplicate in current session: '{title}'")
            return True
        # Then check database (raw string so '\s' and '\w' reach the SQL regexes unescaped)
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT id,
                       lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', '', 'g')) AS norm_title
                FROM news
            )
            SELECT EXISTS(
                SELECT 1 FROM normalized_titles
                WHERE norm_title = %s
            );
        """, (normalized_title,))
        exists = cursor.fetchone()[0]
        if exists:
            logger.info(f"Found duplicate in database: '{title}'")
            return True
        # Add to processed titles if not a duplicate
        self.processed_titles.add(normalized_title)
        return False
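
    # The query above normalizes stored titles with PostgreSQL's regexp_replace so the
    # comparison mirrors normalize_title(); the %s placeholders assume a DB-API driver
    # that uses format-style parameters (e.g. psycopg2 for PostgreSQL).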

    def process_news_item(self, news_item, category):
        """Process a single news item and insert it into the database"""
        conn = None
        cursor = None
        title = ""  # Defined up front so the error handler below can reference it
        try:
            title = news_item.get("title", "").strip()
            if not title:
                logger.warning("Skipping news item with empty title")
                return
            url = news_item.get("link", "").strip()
            if not url:
                logger.warning(f"Skipping news item with empty url: {title}")
                return
            logger.info(f"Processing news item: {title}")
            description = news_item.get("description", "").strip()
            published_at = news_item.get("published", "") or news_item.get("updated", "")
            if published_at:
                published_at = parse_date(published_at)
            else:
                published_at = datetime.now()
            source = news_item.get('source', {}).get('title', 'Unknown')
            if isinstance(source, dict):
                source = source.get('title', 'Unknown')
            conn = self.db_manager.get_connection()
            cursor = conn.cursor()
            if self.is_duplicate(cursor, title):
                logger.info(f"Skipping duplicate news item: {title}")
                return
            cursor.execute("""
                INSERT INTO news (title, url, description, category, date, source)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (url) DO NOTHING
                RETURNING id;
            """, (title, url, description, category, published_at, source))
            result = cursor.fetchone()
            if result:
                news_id = result[0]
                conn.commit()
                logger.info(f"News item added (ID: {news_id}): {title}")
            else:
                # Commit anyway so the connection is not returned to the pool mid-transaction
                conn.commit()
                logger.info(f"News item was not inserted (likely duplicate): {title}")
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Error processing news item '{title}': {str(e)}")
        finally:
            if cursor:
                cursor.close()
            if conn:
                self.db_manager.return_connection(conn)
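
    # The INSERT above assumes the news table created by DatabaseManager.ensure_table_exists()
    # has columns (title, url, description, category, date, source) and a unique
    # constraint on url, which is what ON CONFLICT (url) relies on.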

    def update_news(self):
        """Main function to update news from all sources"""
        logger.info("Starting news update process...")
        try:
            # Ensure table exists with correct schema
            logger.info("Checking database schema...")
            self.db_manager.ensure_table_exists()
            logger.info("Database schema verified")
            # Get RSS sources
            logger.info("Getting RSS sources...")
            sources = self.get_rss_sources()
            if not sources:
                logger.error("No RSS sources found. Please check news_sources.txt")
                return
            # Process each RSS source
            for url, category in sources:
                logger.info(f"Processing feed: {url} (Category: {category})")
                entries = self.fetch_and_parse_feed(url)
                for entry in entries:
                    self.process_news_item(entry, category)
            logger.info("News update completed successfully.")
        except Exception as e:
            logger.error(f"Error in update_news: {str(e)}")
            raise
        finally:
            logger.info("Closing database connections...")
            self.db_manager.close_all()
            logger.info("Database connections closed")


def main():
    logger.info("Starting script execution...")
    try:
        updater = NewsUpdater()
        updater.update_news()
    except KeyboardInterrupt:
        logger.info("Script interrupted by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Script execution failed: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()