# RivaCube/remove_sdgduplicates.py
# Last modified: 2025-02-04 19:31:18 +01:00
# 136 lines | 4.1 KiB | Python
# remove_sdgduplicates.py
from utils.news.utils import DatabaseManager, setup_logging
import re
logger = setup_logging()
def remove_duplicates():
    """Remove duplicate news entries, keeping only the most recent of each.

    Titles are compared after normalization (punctuation replaced with
    spaces, whitespace collapsed, lowercased), so near-identical headlines
    are treated as duplicates. For each group of duplicates, the row with
    the latest ``published_at`` (then ``created_at``) survives.

    The whole operation runs in a single transaction: duplicates are
    reported, surviving rows are copied to a temp table, the ``news`` table
    is rebuilt from it, and the ``news_id_seq`` sequence is realigned.
    Any failure rolls everything back.

    Raises:
        Exception: re-raised after rollback on any database error.
    """
    db = DatabaseManager()
    conn = None
    cursor = None
    try:
        conn = db.get_connection()
        cursor = conn.cursor()

        # Count before, for the summary at the end.
        cursor.execute("SELECT COUNT(*) FROM news")
        count_before = cursor.fetchone()[0]
        logger.info(f"Entries before deduplication: {count_before}")

        # Report duplicated titles (normalized comparison).
        # Raw string: '\s' / '\w' belong to the SQL regexes — a non-raw
        # Python literal would emit SyntaxWarning (error in future versions).
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT
                    id,
                    title,
                    lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', ' ', 'g')) as normalized_title
                FROM news
            )
            SELECT
                title,
                COUNT(*) as duplicate_count
            FROM normalized_titles
            GROUP BY normalized_title, title
            HAVING COUNT(*) > 1
            ORDER BY COUNT(*) DESC;
        """)
        duplicates = cursor.fetchall()
        if duplicates:
            logger.info("\nDuplicates found:")
            for title, count in duplicates:
                logger.info(f"- '{title}' ({count} instances)")

        # Keep only the newest row per normalized title in a temp table.
        cursor.execute(r"""
            CREATE TEMP TABLE news_unique AS
            WITH normalized_news AS (
                SELECT
                    id,
                    title,
                    lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', ' ', 'g')) as normalized_title,
                    link,
                    description,
                    category,
                    published_at,
                    created_at
                FROM news
            ),
            RankedNews AS (
                SELECT
                    id,
                    title,
                    link,
                    description,
                    category,
                    published_at,
                    created_at,
                    ROW_NUMBER() OVER (
                        PARTITION BY normalized_title
                        ORDER BY published_at DESC, created_at DESC
                    ) as rn
                FROM normalized_news
            )
            SELECT
                id, title, link, description,
                category, published_at, created_at
            FROM RankedNews
            WHERE rn = 1;
        """)

        # Rebuild the news table from the surviving rows.
        cursor.execute("DELETE FROM news")
        cursor.execute("""
            INSERT INTO news (
                id, title, link, description,
                category, published_at, created_at
            )
            SELECT
                id, title, link, description,
                category, published_at, created_at
            FROM news_unique
            ORDER BY published_at DESC, created_at DESC;
        """)

        # Count after, for the summary.
        cursor.execute("SELECT COUNT(*) FROM news")
        count_after = cursor.fetchone()[0]

        # Drop the temp table (also gone at session end, but be tidy).
        cursor.execute("DROP TABLE IF EXISTS news_unique")

        # Realign the id sequence with the surviving max id so future
        # inserts don't collide; fall back to 1 for an empty table.
        cursor.execute("""
            SELECT setval('news_id_seq', COALESCE((SELECT MAX(id) FROM news), 1), false);
        """)

        conn.commit()

        removed = count_before - count_after
        logger.info(f"\nRemoved {removed} duplicates")
        logger.info(f"Entries after deduplication: {count_after}")
    except Exception as e:
        # Undo the partial rebuild before surfacing the error.
        if conn:
            conn.rollback()
        logger.error(f"Error during deduplication: {str(e)}")
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            db.return_connection(conn)
        db.close_all()
if __name__ == "__main__":
    try:
        remove_duplicates()
        logger.info("Deduplication completed successfully")
    except Exception as e:
        logger.error(f"Deduplication failed: {str(e)}")
        # raise SystemExit instead of exit(): the exit() builtin comes from
        # the site module and is not guaranteed in all environments.
        raise SystemExit(1)