# remove_sdgduplicates.py
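"""Deduplicate the news table.

Titles are normalized (punctuation stripped, whitespace collapsed, lowercased)
and only the most recent row per normalized title, ordered by published_at and
then created_at, is kept.
"""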

import sys

from utils.news.utils import DatabaseManager, setup_logging

logger = setup_logging()


def remove_duplicates():
    """Remove duplicate news entries, keeping only the most recent."""
    db = DatabaseManager()
    conn = None
    cursor = None

    try:
        conn = db.get_connection()
        cursor = conn.cursor()

        # Get count before
        cursor.execute("SELECT COUNT(*) FROM news")
        count_before = cursor.fetchone()[0]
        logger.info(f"Entries before deduplication: {count_before}")

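        # Titles are compared on a normalized form: punctuation replaced with
        # spaces, runs of whitespace collapsed, and the result lowercased, so
        # cosmetic differences still count as duplicates. The SQL below is
        # passed as raw strings so Python does not treat '\s' and '\w' as
        # escape sequences.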
        # Find duplicates first
        cursor.execute(r"""
            WITH normalized_titles AS (
                SELECT
                    id,
                    title,
                    lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', ' ', 'g')) as normalized_title
                FROM news
            )
            SELECT
                title,
                COUNT(*) as duplicate_count
            FROM normalized_titles
            GROUP BY normalized_title, title
            HAVING COUNT(*) > 1
            ORDER BY COUNT(*) DESC;
        """)

        duplicates = cursor.fetchall()
        if duplicates:
            logger.info("\nDuplicates found:")
            for title, count in duplicates:
                logger.info(f"- '{title}' ({count} instances)")

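        # Keep one row per normalized title: rank each group by published_at,
        # then created_at, and copy the newest entry into a temp table.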
        # Create temporary table
        cursor.execute(r"""
            CREATE TEMP TABLE news_unique AS
            WITH normalized_news AS (
                SELECT
                    id,
                    title,
                    lower(regexp_replace(regexp_replace(title, '\s+', ' ', 'g'), '[^\w\s]', ' ', 'g')) as normalized_title,
                    link,
                    description,
                    category,
                    published_at,
                    created_at
                FROM news
            ),
            RankedNews AS (
                SELECT
                    id,
                    title,
                    link,
                    description,
                    category,
                    published_at,
                    created_at,
                    ROW_NUMBER() OVER (
                        PARTITION BY normalized_title
                        ORDER BY published_at DESC, created_at DESC
                    ) as rn
                FROM normalized_news
            )
            SELECT
                id, title, link, description,
                category, published_at, created_at
            FROM RankedNews
            WHERE rn = 1;
        """)

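        # Swap the table contents: the DELETE and re-INSERT run in the same
        # transaction as the commit below, so a failure triggers the rollback
        # in the except block and the original rows are left untouched.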
        # Delete all from original table
        cursor.execute("DELETE FROM news")

        # Insert unique entries back
        cursor.execute("""
            INSERT INTO news (
                id, title, link, description,
                category, published_at, created_at
            )
            SELECT
                id, title, link, description,
                category, published_at, created_at
            FROM news_unique
            ORDER BY published_at DESC, created_at DESC;
        """)

        # Get count after
        cursor.execute("SELECT COUNT(*) FROM news")
        count_after = cursor.fetchone()[0]

        # Cleanup
        cursor.execute("DROP TABLE IF EXISTS news_unique")

        # Reset sequence
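        # With the third argument false, the next nextval() returns the value
        # passed to setval() itself, so use MAX(id) + 1 (or 1 when the table is
        # empty) to avoid handing out an id that is already in use.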
        cursor.execute("""
            SELECT setval('news_id_seq', COALESCE((SELECT MAX(id) FROM news), 0) + 1, false);
        """)

        conn.commit()

        removed = count_before - count_after
        logger.info(f"\nRemoved {removed} duplicates")
        logger.info(f"Entries after deduplication: {count_after}")

    except Exception as e:
        if conn:
            conn.rollback()
        logger.error(f"Error during deduplication: {str(e)}")
        raise
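    # Release resources in all cases: close the cursor, hand the connection
    # back to the DatabaseManager, then close its remaining connections.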
    finally:
        if cursor:
            cursor.close()
        if conn:
            db.return_connection(conn)
        db.close_all()


if __name__ == "__main__":
    try:
        remove_duplicates()
        logger.info("Deduplication completed successfully")
    except Exception as e:
        logger.error(f"Deduplication failed: {str(e)}")
        sys.exit(1)