# utils/Stocks/historical/collector.py
import logging
import time
from datetime import datetime, date
from threading import Lock
from typing import Any, Dict, List, Tuple

import psycopg2
import requests
from psycopg2.extras import execute_values


class RateLimiter:
    """Simple sliding-window rate limiter, safe to share across threads."""

    def __init__(self, calls_per_minute: int):
        self.calls_per_minute = calls_per_minute
        self.calls = []  # timestamps of calls made within the last minute
        self.lock = Lock()

    def wait_if_needed(self):
        """Wait if we've exceeded our rate limit."""
        now = time.time()
        minute_ago = now - 60
        with self.lock:
            # Drop timestamps that have aged out of the one-minute window.
            self.calls = [call for call in self.calls if call > minute_ago]
            if len(self.calls) >= self.calls_per_minute:
                # Sleep until the oldest call in the window expires.
                sleep_time = self.calls[0] - minute_ago
                if sleep_time > 0:
                    time.sleep(sleep_time)
            # Record the call at its actual (post-sleep) time, so the window
            # is not skewed by however long we just slept.
            self.calls.append(time.time())
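
# A minimal usage sketch (illustrative, not part of the module): one limiter
# instance is shared by everything that talks to the API, so the process as a
# whole stays under the per-minute cap.
#
#     limiter = RateLimiter(calls_per_minute=300)
#     limiter.wait_if_needed()   # blocks only when the window is full
#     ...                        # the actual API call goes here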


class StockDataClient:
    def __init__(self, api_key: str, rate_limit_per_minute: int = 300):
        self.api_key = api_key
        self.base_url = "https://financialmodelingprep.com/api/v3"
        self.rate_limiter = RateLimiter(rate_limit_per_minute)
        self.session = requests.Session()

    def get_historical_price(self, symbol: str, from_date: date) -> List[Dict]:
        """Fetch historical daily price data for a symbol."""
        self.rate_limiter.wait_if_needed()
        try:
            url = f"{self.base_url}/historical-price-full/{symbol}"
            params = {
                'apikey': self.api_key,
                'from': from_date.strftime('%Y-%m-%d'),
                'to': datetime.now().date().strftime('%Y-%m-%d')
            }
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if 'historical' not in data:
                logging.warning(f"No historical data found for {symbol}")
                return []
            return data['historical']
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching data for {symbol}: {e}")
            # e.response is None for connection-level failures, so guard
            # before reading status_code. Back off briefly and retry on a
            # 429 rate-limit response.
            if e.response is not None and e.response.status_code == 429:
                time.sleep(5)
                return self.get_historical_price(symbol, from_date)
            return []
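
# Illustrative only: the endpoint wraps its rows in a "historical" key, which
# is why get_historical_price unwraps it. The row shape assumed by the code
# further down looks roughly like:
#
#     client = StockDataClient(api_key="...")  # hypothetical key
#     rows = client.get_historical_price("AAPL", date(2024, 1, 1))
#     # rows ~ [{'date': '2024-01-02', 'open': ..., 'high': ..., 'low': ...,
#     #          'close': ..., 'adjClose': ..., 'volume': ...}, ...]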


def deduplicate_stock_data(data: List[Tuple]) -> List[Tuple]:
    """Deduplicate stock data based on ticker_id and date."""
    seen = set()
    deduped_data = []
    for item in data:
        key = (item[0], item[1])  # (ticker_id, date)
        if key not in seen:
            seen.add(key)
            deduped_data.append(item)
    return deduped_data
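
# Example: for rows sharing a (ticker_id, date) key, only the first survives.
#
#     d = date(2024, 1, 2)
#     deduplicate_stock_data([(1, d, 10.0), (1, d, 11.0), (2, d, 5.0)])
#     # -> [(1, d, 10.0), (2, d, 5.0)]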


def batch_insert_stocks(cursor: psycopg2.extensions.cursor, data: List[Tuple]) -> int:
    """Bulk upsert stock rows, deduplicating first.

    Deduplication matters here: Postgres rejects an INSERT ... ON CONFLICT
    DO UPDATE that touches the same (ticker_id, date) row twice in one
    statement.
    """
    if not data:
        return 0
    try:
        deduped_data = deduplicate_stock_data(data)
        # Insert in smaller batches to keep individual statements small.
        batch_size = 500
        total_inserted = 0
        for i in range(0, len(deduped_data), batch_size):
            batch = deduped_data[i:i + batch_size]
            execute_values(cursor, """
                INSERT INTO stocks
                    (ticker_id, date, open, high, low, close, adj_close, volume)
                VALUES %s
                ON CONFLICT (ticker_id, date) DO UPDATE SET
                    open = EXCLUDED.open,
                    high = EXCLUDED.high,
                    low = EXCLUDED.low,
                    close = EXCLUDED.close,
                    adj_close = EXCLUDED.adj_close,
                    volume = EXCLUDED.volume
            """, batch)
            total_inserted += len(batch)
        return total_inserted
    except Exception as e:
        logging.error(f"Error in batch insert: {e}")
        raise
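
# The upsert above requires a unique constraint on (ticker_id, date). A
# plausible minimal shape for the table (the project's actual DDL may differ):
#
#     CREATE TABLE stocks (
#         ticker_id  INTEGER NOT NULL REFERENCES tickers(id),
#         date       DATE    NOT NULL,
#         open       NUMERIC,
#         high       NUMERIC,
#         low        NUMERIC,
#         close      NUMERIC,
#         adj_close  NUMERIC,
#         volume     BIGINT,
#         PRIMARY KEY (ticker_id, date)
#     );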


def process_ticker_batch(
    tickers: List[Dict],
    client: StockDataClient,
    connection: psycopg2.extensions.connection
) -> int:
    """Fetch and upsert historical prices for a batch of tickers."""
    cursor = connection.cursor()
    records_processed = 0
    data_to_insert = []
    try:
        for ticker in tickers:
            try:
                historical_data = client.get_historical_price(
                    ticker['ticker'],
                    ticker['last_update']
                )
                batch_data = [
                    (ticker['id'],
                     datetime.strptime(day['date'], '%Y-%m-%d').date(),
                     day.get('open'),
                     day.get('high'),
                     day.get('low'),
                     day.get('close'),
                     day.get('adjClose', day.get('close')),  # fall back to raw close
                     day.get('volume'))
                    for day in historical_data
                ]
                if batch_data:
                    data_to_insert.extend(batch_data)
                    # Flush and commit roughly every 1000 rows to keep
                    # transactions small.
                    if len(data_to_insert) >= 1000:
                        try:
                            records_processed += batch_insert_stocks(cursor, data_to_insert)
                            connection.commit()
                        except Exception as e:
                            connection.rollback()
                            logging.error(f"Error processing batch for {ticker['ticker']}: {e}")
                        finally:
                            # Clear the buffer either way, so a failed batch
                            # is not retried (and re-failed) on the next flush.
                            data_to_insert = []
            except Exception as e:
                logging.error(f"Error processing {ticker['ticker']}: {e}")
                continue
        # Insert any remaining records.
        if data_to_insert:
            try:
                records_processed += batch_insert_stocks(cursor, data_to_insert)
                connection.commit()
            except Exception as e:
                connection.rollback()
                logging.error(f"Error processing final batch: {e}")
        return records_processed
    finally:
        cursor.close()
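
# Each ticker dict is expected to carry at least these keys (inferred from
# the lookups above; values illustrative):
#
#     {'id': 42,                         # tickers.id, used as stocks.ticker_id
#      'ticker': 'AAPL',                 # symbol passed to the API
#      'last_update': date(2024, 1, 1)}  # start of the fetch window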


def update_stocks_batch(
    tickers: List[Dict],
    connection: psycopg2.extensions.connection,
    client: StockDataClient,
    batch_size: int = 50
) -> int:
    """Update stocks in batches."""
    total_records = 0
    # Process tickers in batches.
    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i + batch_size]
        try:
            records = process_ticker_batch(batch, client, connection)
            total_records += records
            logging.info(f"Processed batch {i//batch_size + 1}, records: {records}")
        except Exception as e:
            logging.error(f"Error processing batch {i//batch_size + 1}: {e}")
            continue
    return total_records


def get_stock_statistics(connection: psycopg2.extensions.connection) -> Dict[str, Any]:
    """Return summary counts and freshness info for the stocks tables."""
    cursor = None
    try:
        cursor = connection.cursor()
        stats = {}
        cursor.execute("""
            SELECT
                (SELECT COUNT(*) FROM stocks) as stocks_count,
                (SELECT COUNT(*) FROM tickers) as tickers_count,
                (SELECT MAX(date) FROM stocks) as latest_date,
                (SELECT COUNT(DISTINCT ticker_id)
                 FROM stocks
                 WHERE date >= CURRENT_DATE - INTERVAL '7 days') as recent_updates
        """)
        row = cursor.fetchone()
        if row:
            stats['stocks_count'] = row[0]
            stats['tickers_count'] = row[1]
            stats['latest_stock_date'] = row[2]
            stats['recently_updated_tickers'] = row[3]
        return stats
    except Exception as e:
        logging.error(f"Error getting stock statistics: {e}")
        return {}
    finally:
        if cursor:
            cursor.close()
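
# On success the returned dict looks like (values illustrative):
#
#     {'stocks_count': 1234567, 'tickers_count': 8000,
#      'latest_stock_date': date(2025, 2, 3), 'recently_updated_tickers': 7500}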


def track_invalid_ticker(connection, ticker: str, reason: str):
    """Record a ticker that failed to fetch, bumping its attempt count."""
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute("""
            INSERT INTO invalid_tickers (ticker, reason)
            VALUES (%s, %s)
            ON CONFLICT (ticker)
            DO UPDATE SET
                attempts = invalid_tickers.attempts + 1,
                last_check = CURRENT_TIMESTAMP
        """, (ticker, reason))
        connection.commit()
    except Exception as e:
        # Roll back so the connection stays usable for later work.
        connection.rollback()
        logging.error(f"Error tracking invalid ticker {ticker}: {e}")
    finally:
        if cursor:
            cursor.close()
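
# The upsert above assumes invalid_tickers looks roughly like this (attempts
# and last_check must exist, with defaults, for the first INSERT to work):
#
#     CREATE TABLE invalid_tickers (
#         ticker     TEXT PRIMARY KEY,
#         reason     TEXT,
#         attempts   INTEGER   DEFAULT 1,
#         last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP
#     );
#
# A minimal end-to-end driver sketch, assuming connection details and an API
# key come from the environment (variable names hypothetical):
#
#     import os
#     conn = psycopg2.connect(os.environ['DATABASE_URL'])
#     client = StockDataClient(api_key=os.environ['FMP_API_KEY'])
#     tickers = [{'id': 1, 'ticker': 'AAPL', 'last_update': date(2024, 1, 1)}]
#     update_stocks_batch(tickers, conn, client)
#     print(get_stock_statistics(conn))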