# remove_sdgduplicates.py
"""Remove duplicate news entries from the database, keeping the most recent."""

import re
import sys

from utils.news.utils import DatabaseManager, setup_logging

logger = setup_logging()

# Title normalization used for duplicate detection: collapse runs of
# whitespace, replace punctuation with spaces, lowercase.  Kept as a single
# constant so the duplicate report and the dedup query can never drift apart.
# Raw strings: '\s' / '\w' inside a non-raw Python string are invalid escape
# sequences (SyntaxWarning on modern CPython).
_NORMALIZED_TITLE_SQL = (
    r"lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), "
    r"'[^\w\s]', ' ', 'g'))"
)


def remove_duplicates():
    """Remove duplicate news entries keeping only the most recent.

    Rows whose titles are equal after normalization (see
    ``_NORMALIZED_TITLE_SQL``) are considered duplicates; within each group
    the row with the latest ``published_at`` (then ``created_at``) survives.
    The whole operation runs in a single transaction and is rolled back on
    any error.

    Raises:
        Exception: any database error; the transaction is rolled back and
            the error is re-raised after logging.
    """
    db = DatabaseManager()
    conn = None
    cursor = None
    try:
        conn = db.get_connection()
        cursor = conn.cursor()

        # Count before, for the summary log at the end.
        cursor.execute("SELECT COUNT(*) FROM news")
        count_before = cursor.fetchone()[0]
        logger.info(f"Entries before deduplication: {count_before}")

        # Report duplicate groups.  Group by the normalized title ONLY so
        # the report matches the PARTITION BY used for removal below (the
        # previous version also grouped by the raw title, under-counting
        # groups whose raw titles differed).  MIN(title) picks one
        # representative raw title per group for display.
        cursor.execute(f"""
            WITH normalized_titles AS (
                SELECT id, title,
                       {_NORMALIZED_TITLE_SQL} AS normalized_title
                FROM news
            )
            SELECT MIN(title) AS title, COUNT(*) AS duplicate_count
            FROM normalized_titles
            GROUP BY normalized_title
            HAVING COUNT(*) > 1
            ORDER BY COUNT(*) DESC;
        """)
        duplicates = cursor.fetchall()

        if duplicates:
            logger.info("\nDuplicates found:")
            for title, count in duplicates:
                logger.info(f"- '{title}' ({count} instances)")

        # Stage the survivors: rank rows within each normalized-title group
        # by recency and keep only the top-ranked one.
        cursor.execute(f"""
            CREATE TEMP TABLE news_unique AS
            WITH normalized_news AS (
                SELECT id, title,
                       {_NORMALIZED_TITLE_SQL} AS normalized_title,
                       link, description, category, published_at, created_at
                FROM news
            ),
            RankedNews AS (
                SELECT id, title, link, description, category,
                       published_at, created_at,
                       ROW_NUMBER() OVER (
                           PARTITION BY normalized_title
                           ORDER BY published_at DESC, created_at DESC
                       ) AS rn
                FROM normalized_news
            )
            SELECT id, title, link, description, category,
                   published_at, created_at
            FROM RankedNews
            WHERE rn = 1;
        """)

        # Swap the table contents for the de-duplicated rows.
        cursor.execute("DELETE FROM news")
        cursor.execute("""
            INSERT INTO news (
                id, title, link, description, category,
                published_at, created_at
            )
            SELECT id, title, link, description, category,
                   published_at, created_at
            FROM news_unique
            ORDER BY published_at DESC, created_at DESC;
        """)

        cursor.execute("SELECT COUNT(*) FROM news")
        count_after = cursor.fetchone()[0]

        cursor.execute("DROP TABLE IF EXISTS news_unique")

        # Reset the id sequence so the next nextval() yields MAX(id) + 1.
        # BUG FIX: the previous setval(..., MAX(id), false) made the next
        # generated id equal an EXISTING id (is_called=false means nextval
        # returns the stored value itself), causing a duplicate-key error on
        # the next insert.  With COALESCE(..., 0) + 1 an empty table still
        # starts the sequence at 1.
        cursor.execute("""
            SELECT setval(
                'news_id_seq',
                COALESCE((SELECT MAX(id) FROM news), 0) + 1,
                false
            );
        """)

        conn.commit()

        removed = count_before - count_after
        logger.info(f"\nRemoved {removed} duplicates")
        logger.info(f"Entries after deduplication: {count_after}")

    except Exception as e:
        if conn:
            conn.rollback()
        logger.error(f"Error during deduplication: {str(e)}")
        raise
    finally:
        # Release resources in reverse acquisition order; close_all() runs
        # even if the connection was never obtained.
        if cursor:
            cursor.close()
        if conn:
            db.return_connection(conn)
        db.close_all()


if __name__ == "__main__":
    try:
        remove_duplicates()
        logger.info("Deduplication completed successfully")
    except Exception as e:
        logger.error(f"Deduplication failed: {str(e)}")
        # sys.exit, not the site-module exit() helper, in scripts.
        sys.exit(1)