# RivaCube/remove_sdgduplicates.py
# Last modified: 2025-02-04 19:31:18 +01:00
# 136 lines | 4.1 KiB | Python
# remove_sdgduplicates.py
from utils.news.utils import DatabaseManager, setup_logging
import re
logger = setup_logging()
def remove_duplicates():
    """Remove duplicate news entries, keeping only the most recent of each.

    Titles are compared after normalization (punctuation replaced with
    spaces, whitespace collapsed, lowercased), so near-identical headlines
    are treated as duplicates. For each group of duplicates, the row with
    the latest ``published_at`` (then ``created_at``) survives.

    The whole operation runs in a single transaction: duplicates are
    reported, surviving rows are copied to a temp table, the ``news`` table
    is rebuilt from it, and the ``news_id_seq`` sequence is realigned.
    Any failure rolls everything back.

    Raises:
        Exception: re-raised after rollback on any database error.
    """
    db = DatabaseManager()
    conn = None
    cursor = None
    try:
        conn = db.get_connection()
        cursor = conn.cursor()

        # Count before, for the summary at the end.
        cursor.execute("SELECT COUNT(*) FROM news")
        count_before = cursor.fetchone()[0]
        logger.info(f"Entries before deduplication: {count_before}")

        # Report duplicated titles (normalized comparison).
        # Raw string: '\s' / '\w' belong to the SQL regexes — a non-raw
        # Python literal would emit SyntaxWarning (error in future versions).
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT
                    id,
                    title,
                    lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', ' ', 'g')) as normalized_title
                FROM news
            )
            SELECT
                title,
                COUNT(*) as duplicate_count
            FROM normalized_titles
            GROUP BY normalized_title, title
            HAVING COUNT(*) > 1
            ORDER BY COUNT(*) DESC;
        """)
        duplicates = cursor.fetchall()
        if duplicates:
            logger.info("\nDuplicates found:")
            for title, count in duplicates:
                logger.info(f"- '{title}' ({count} instances)")

        # Keep only the newest row per normalized title in a temp table.
        cursor.execute(r"""
            CREATE TEMP TABLE news_unique AS
            WITH normalized_news AS (
                SELECT
                    id,
                    title,
                    lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', ' ', 'g')) as normalized_title,
                    link,
                    description,
                    category,
                    published_at,
                    created_at
                FROM news
            ),
            RankedNews AS (
                SELECT
                    id,
                    title,
                    link,
                    description,
                    category,
                    published_at,
                    created_at,
                    ROW_NUMBER() OVER (
                        PARTITION BY normalized_title
                        ORDER BY published_at DESC, created_at DESC
                    ) as rn
                FROM normalized_news
            )
            SELECT
                id, title, link, description,
                category, published_at, created_at
            FROM RankedNews
            WHERE rn = 1;
        """)

        # Rebuild the news table from the surviving rows.
        cursor.execute("DELETE FROM news")
        cursor.execute("""
            INSERT INTO news (
                id, title, link, description,
                category, published_at, created_at
            )
            SELECT
                id, title, link, description,
                category, published_at, created_at
            FROM news_unique
            ORDER BY published_at DESC, created_at DESC;
        """)

        # Count after, for the summary.
        cursor.execute("SELECT COUNT(*) FROM news")
        count_after = cursor.fetchone()[0]

        # Drop the temp table (also gone at session end, but be tidy).
        cursor.execute("DROP TABLE IF EXISTS news_unique")

        # Realign the id sequence with the surviving max id so future
        # inserts don't collide; fall back to 1 for an empty table.
        cursor.execute("""
            SELECT setval('news_id_seq', COALESCE((SELECT MAX(id) FROM news), 1), false);
        """)

        conn.commit()

        removed = count_before - count_after
        logger.info(f"\nRemoved {removed} duplicates")
        logger.info(f"Entries after deduplication: {count_after}")
    except Exception as e:
        # Undo the partial rebuild before surfacing the error.
        if conn:
            conn.rollback()
        logger.error(f"Error during deduplication: {str(e)}")
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            db.return_connection(conn)
        db.close_all()
if __name__ == "__main__":
    try:
        remove_duplicates()
        logger.info("Deduplication completed successfully")
    except Exception as e:
        logger.error(f"Deduplication failed: {str(e)}")
        # raise SystemExit instead of exit(): the exit() builtin comes from
        # the site module and is not guaranteed in all environments.
        raise SystemExit(1)