# utils/stocks/collector.py

import logging
import time
from datetime import datetime, date
from threading import Lock
from typing import Any, Dict, List, Tuple

import psycopg2
import requests
from psycopg2.extras import execute_values


class RateLimiter:
    """Simple sliding-window rate limiter shared across threads."""

    def __init__(self, calls_per_minute: int):
        self.calls_per_minute = calls_per_minute
        self.calls = []  # timestamps of calls made within the last minute
        self.lock = Lock()

    def wait_if_needed(self):
        """Block until another call is allowed under the per-minute limit."""
        now = time.time()
        minute_ago = now - 60

        with self.lock:
            # Drop timestamps that have aged out of the one-minute window.
            self.calls = [call for call in self.calls if call > minute_ago]
            if len(self.calls) >= self.calls_per_minute:
                # Sleep until the oldest call falls out of the window. The
                # lock is held while sleeping, which deliberately serializes
                # all callers behind the limit.
                sleep_time = self.calls[0] - minute_ago
                if sleep_time > 0:
                    time.sleep(sleep_time)
            self.calls.append(now)


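# Usage sketch (illustrative, not part of the collector flow): one limiter
# instance is shared by every thread that talks to the API, so their
# combined request rate stays under the provider's cap.
#
#     limiter = RateLimiter(calls_per_minute=300)
#     limiter.wait_if_needed()  # returns immediately until the window fills
#     ... perform one API call here ...

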
class StockDataClient:
    """Thin client for the Financial Modeling Prep (FMP) v3 REST API."""

    def __init__(self, api_key: str, rate_limit_per_minute: int = 300):
        self.api_key = api_key
        self.base_url = "https://financialmodelingprep.com/api/v3"
        self.rate_limiter = RateLimiter(rate_limit_per_minute)
        self.session = requests.Session()

    def get_historical_price(self, symbol: str, from_date: date) -> List[Dict]:
        """Fetch historical daily price data for a symbol."""
        self.rate_limiter.wait_if_needed()

        try:
            url = f"{self.base_url}/historical-price-full/{symbol}"
            params = {
                'apikey': self.api_key,
                'from': from_date.strftime('%Y-%m-%d'),
                'to': date.today().strftime('%Y-%m-%d'),
            }

            response = self.session.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            if 'historical' not in data:
                logging.warning(f"No historical data found for {symbol}")
                return []

            return data['historical']

        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching data for {symbol}: {e}")
            # Use e.response rather than the local variable: `response` is
            # unbound if the request itself failed before returning.
            if e.response is not None and e.response.status_code == 429:
                # Rate-limited: back off briefly, then retry.
                time.sleep(5)
                return self.get_historical_price(symbol, from_date)
            return []


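# Usage sketch (hypothetical key and symbol; the response items carry the
# 'date'/'open'/'high'/'low'/'close'/'adjClose'/'volume' keys consumed in
# process_ticker_batch below):
#
#     client = StockDataClient(api_key=os.environ["FMP_API_KEY"])
#     bars = client.get_historical_price("AAPL", date(2024, 1, 1))
#     # -> [{'date': '2024-01-02', 'open': ..., 'close': ...}, ...]

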
def deduplicate_stock_data(data: List[Tuple]) -> List[Tuple]:
    """Deduplicate stock data based on ticker_id and date."""
    seen = set()
    deduped_data = []

    for item in data:
        key = (item[0], item[1])  # (ticker_id, date)
        if key not in seen:
            seen.add(key)
            deduped_data.append(item)

    return deduped_data


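# Example: the first row seen for a duplicated (ticker_id, date) key wins.
#
#     >>> deduplicate_stock_data([(1, '2024-01-02', 10.0), (1, '2024-01-02', 99.0)])
#     [(1, '2024-01-02', 10.0)]

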
def batch_insert_stocks(cursor: psycopg2.extensions.cursor, data: List[Tuple]) -> int:
    """Bulk upsert stock data with deduplication; returns rows written."""
    if not data:
        return 0

    try:
        # Deduplicate before insertion: ON CONFLICT DO UPDATE cannot affect
        # the same (ticker_id, date) row twice within one statement.
        deduped_data = deduplicate_stock_data(data)

        # Insert in smaller batches to keep individual statements manageable
        batch_size = 500
        total_inserted = 0

        for i in range(0, len(deduped_data), batch_size):
            batch = deduped_data[i:i + batch_size]
            execute_values(cursor, """
                INSERT INTO stocks
                    (ticker_id, date, open, high, low, close, adj_close, volume)
                VALUES %s
                ON CONFLICT (ticker_id, date) DO UPDATE SET
                    open = EXCLUDED.open,
                    high = EXCLUDED.high,
                    low = EXCLUDED.low,
                    close = EXCLUDED.close,
                    adj_close = EXCLUDED.adj_close,
                    volume = EXCLUDED.volume
            """, batch)
            # Count covers both fresh inserts and conflict updates.
            total_inserted += len(batch)

        return total_inserted
    except Exception as e:
        logging.error(f"Error in batch insert: {e}")
        raise


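# The ON CONFLICT clause above assumes a unique constraint on
# (ticker_id, date). A minimal sketch of the expected table; the real
# schema may differ:
#
#     CREATE TABLE IF NOT EXISTS stocks (
#         ticker_id INTEGER NOT NULL,
#         date      DATE    NOT NULL,
#         open      NUMERIC, high NUMERIC, low NUMERIC, close NUMERIC,
#         adj_close NUMERIC,
#         volume    BIGINT,
#         PRIMARY KEY (ticker_id, date)
#     );

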
def process_ticker_batch(
    tickers: List[Dict],
    client: StockDataClient,
    connection: psycopg2.extensions.connection
) -> int:
    """Fetch and store price history for a batch of tickers."""
    cursor = connection.cursor()
    records_processed = 0
    data_to_insert = []

    try:
        for ticker in tickers:
            try:
                historical_data = client.get_historical_price(
                    ticker['ticker'],
                    ticker['last_update']
                )

                batch_data = [
                    (ticker['id'],
                     datetime.strptime(day['date'], '%Y-%m-%d').date(),
                     day.get('open'),
                     day.get('high'),
                     day.get('low'),
                     day.get('close'),
                     day.get('adjClose', day.get('close')),  # fall back to close
                     day.get('volume'))
                    for day in historical_data
                ]

                if batch_data:
                    # Accumulate rows across tickers and flush in chunks, so
                    # each commit covers roughly 1000 rows.
                    try:
                        data_to_insert.extend(batch_data)
                        if len(data_to_insert) >= 1000:
                            records_processed += batch_insert_stocks(cursor, data_to_insert)
                            data_to_insert = []
                            connection.commit()
                    except Exception as e:
                        connection.rollback()
                        # Discard the failed rows so they are not retried on
                        # the next flush, which would fail again.
                        data_to_insert = []
                        logging.error(f"Error processing batch for {ticker['ticker']}: {e}")
                        continue

            except Exception as e:
                logging.error(f"Error processing {ticker['ticker']}: {e}")
                continue

        # Insert any remaining records
        if data_to_insert:
            try:
                records_processed += batch_insert_stocks(cursor, data_to_insert)
                connection.commit()
            except Exception as e:
                connection.rollback()
                logging.error(f"Error processing final batch: {e}")

        return records_processed

    finally:
        cursor.close()


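# Each ticker dict is expected to look like this (inferred from the keys
# used above; values are illustrative):
#
#     {'id': 42, 'ticker': 'AAPL', 'last_update': date(2024, 1, 1)}
#
# where 'last_update' is the date from which history should be (re)fetched.

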
def update_stocks_batch(
    tickers: List[Dict],
    connection: psycopg2.extensions.connection,
    client: StockDataClient,
    batch_size: int = 50
) -> int:
    """Update stocks in batches of `batch_size` tickers."""
    total_records = 0

    # Process tickers in batches
    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i + batch_size]
        try:
            records = process_ticker_batch(batch, client, connection)
            total_records += records
            logging.info(f"Processed batch {i//batch_size + 1}, records: {records}")
        except Exception as e:
            logging.error(f"Error processing batch {i//batch_size + 1}: {e}")
            continue

    return total_records


def get_stock_statistics(connection: psycopg2.extensions.connection) -> Dict[str, Any]:
    """Return summary counts and freshness info for the stocks tables."""
    cursor = None
    try:
        cursor = connection.cursor()
        stats = {}

        cursor.execute("""
            SELECT
                (SELECT COUNT(*) FROM stocks) AS stocks_count,
                (SELECT COUNT(*) FROM tickers) AS tickers_count,
                (SELECT MAX(date) FROM stocks) AS latest_date,
                (SELECT COUNT(DISTINCT ticker_id)
                 FROM stocks
                 WHERE date >= CURRENT_DATE - INTERVAL '7 days') AS recent_updates
        """)

        row = cursor.fetchone()
        if row:
            stats['stocks_count'] = row[0]
            stats['tickers_count'] = row[1]
            stats['latest_stock_date'] = row[2]
            stats['recently_updated_tickers'] = row[3]

        return stats
    except Exception as e:
        logging.error(f"Error getting stock statistics: {e}")
        return {}
    finally:
        if cursor:
            cursor.close()


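# Example return value (illustrative numbers):
#
#     {'stocks_count': 1250000, 'tickers_count': 5000,
#      'latest_stock_date': date(2024, 1, 5), 'recently_updated_tickers': 4870}

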
def track_invalid_ticker(connection: psycopg2.extensions.connection, ticker: str, reason: str):
    """Record (or re-count) a ticker that failed validation or data fetch."""
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute("""
            INSERT INTO invalid_tickers (ticker, reason)
            VALUES (%s, %s)
            ON CONFLICT (ticker)
            DO UPDATE SET
                attempts = invalid_tickers.attempts + 1,
                last_check = CURRENT_TIMESTAMP
        """, (ticker, reason))
        connection.commit()
    except Exception as e:
        # Roll back so the connection is not left in an aborted transaction.
        connection.rollback()
        logging.error(f"Error tracking invalid ticker {ticker}: {e}")
    finally:
        if cursor:
            cursor.close()


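# Minimal end-to-end sketch, assuming a `tickers(id, ticker, last_update)`
# table and environment variables for credentials (the variable names are
# illustrative, not part of this module's contract):
if __name__ == "__main__":
    import os

    logging.basicConfig(level=logging.INFO)

    conn = psycopg2.connect(os.environ["STOCKS_DSN"])  # e.g. postgresql://...
    client = StockDataClient(api_key=os.environ["FMP_API_KEY"])

    # Load the tickers to refresh, then run the batched update.
    with conn.cursor() as cur:
        cur.execute("SELECT id, ticker, last_update FROM tickers")
        tickers = [
            {'id': r[0], 'ticker': r[1], 'last_update': r[2]}
            for r in cur.fetchall()
        ]

    total = update_stocks_batch(tickers, conn, client)
    logging.info(f"Updated {total} rows; stats: {get_stock_statistics(conn)}")
    conn.close()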