import logging
from datetime import datetime
from time import sleep
from typing import Any, Dict, List, Optional

import requests


class TranscriptCollector:
    """Fetch earnings-call transcripts from the financialmodelingprep.com API.

    Every HTTP call goes through :meth:`_make_request`, which throttles,
    retries and records counters in ``self.stats``. The public methods never
    raise: on any failure they log and return an empty result (``[]`` or
    ``None``).
    """

    def __init__(self, api_key: str, timeout: float = 30.0):
        """Create a collector.

        Args:
            api_key: Key sent as the ``apikey`` query parameter on every call.
            timeout: Seconds to wait for the server before a request is
                treated as failed (passed to ``requests.get``).
        """
        self.api_key = api_key
        self.timeout = timeout
        self.base_url = "https://financialmodelingprep.com/api/v3"
        self.base_url_v4 = "https://financialmodelingprep.com/api/v4"
        self.rate_limit_pause = 0.5  # Increased from 0.2
        # 'requests' counts calls to _make_request; 'failed' counts failed
        # *attempts*, so with retries it can exceed 'requests'.
        self.stats = {
            'requests': 0,
            'successful': 0,
            'failed': 0,
            'empty': 0
        }

    def _redact(self, text: str) -> str:
        """Return *text* with the API key masked so it never reaches the logs."""
        key = self.api_key
        # Guard against an empty key: "".replace("", "***") would garble text.
        if isinstance(key, str) and key:
            return text.replace(key, "***")
        return text

    @staticmethod
    def _to_int(value: Any) -> Optional[int]:
        """Return ``int(value)``, or ``None`` when it cannot be converted."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _is_valid_period(year: Any, quarter: Any) -> bool:
        """Check that year is in 1900..current year and quarter is in 1..4.

        Raises:
            TypeError, ValueError: if either value cannot be converted to int.
        """
        year_num = int(year)
        quarter_num = int(quarter)
        return 1900 <= year_num <= datetime.now().year and 1 <= quarter_num <= 4

    @staticmethod
    def _normalize_date_entry(entry: Any) -> Optional[Dict]:
        """Turn one entry of the dates response into a dict, or ``None``.

        Dicts are returned unchanged. List/tuple rows are mapped to
        ``{'quarter', 'year', 'date'}``.
        NOTE(review): the row order [quarter, year, date] is an assumption
        about the v4 dates endpoint - confirm against the API documentation.
        """
        if isinstance(entry, dict):
            return entry
        if isinstance(entry, (list, tuple)) and len(entry) >= 2:
            normalized = {'quarter': entry[0], 'year': entry[1]}
            if len(entry) >= 3:
                normalized['date'] = entry[2]
            return normalized
        return None

    def _make_request(self, url: str, params: Dict, max_retries: int = 3) -> Any:
        """Make API request with retry logic.

        Args:
            url: Endpoint URL (without query string).
            params: Query parameters, including the API key.
            max_retries: Total number of attempts; values below 1 are treated
                as a single attempt.

        Returns:
            The decoded JSON payload on success, or ``[]`` when the body is
            empty, all attempts fail, or an unexpected error occurs.
        """
        self.stats['requests'] += 1
        attempts = max(1, max_retries)

        for attempt in range(attempts):
            try:
                sleep(self.rate_limit_pause * (attempt + 1))  # Incremental backoff
                # Without a timeout a stalled connection would block forever.
                response = requests.get(url, params=params, timeout=self.timeout)
                response.raise_for_status()

                if not response.content:
                    self.stats['empty'] += 1
                    logging.warning(f"Empty response from {url}")
                    return []

                data = response.json()
                self.stats['successful'] += 1
                return data

            except requests.exceptions.RequestException as e:
                self.stats['failed'] += 1
                # The exception text can embed the full request URL, which
                # carries the apikey query parameter - mask it before logging.
                logging.error(
                    f"API request error (attempt {attempt + 1}/{attempts}) "
                    f"for {url}: {self._redact(str(e))}"
                )
                if attempt < attempts - 1:
                    sleep(2 ** attempt)  # Exponential backoff

            except Exception as e:
                # Anything that is not a requests error is not retried.
                self.stats['failed'] += 1
                logging.error(f"Unexpected error for {url}: {self._redact(str(e))}")
                return []

        # Every attempt raised a RequestException.
        return []

    def get_transcript_dates(self, ticker: str) -> List[Dict]:
        """Get list of available transcript dates with validation.

        Queries the v4 ``earning_call_transcript`` endpoint and keeps only
        entries whose year/quarter pass :meth:`_is_valid_period`.

        Args:
            ticker: Symbol sent as the ``symbol`` query parameter.

        Returns:
            The valid entries as dicts; ``[]`` on any error.
        """
        try:
            params = {'apikey': self.api_key, 'symbol': ticker}
            response = self._make_request(
                f"{self.base_url_v4}/earning_call_transcript", params
            )

            if isinstance(response, dict):
                entries = [response]
            elif isinstance(response, list):
                entries = response
            else:
                return []

            valid_dates = []
            for raw_entry in entries:
                date_info = self._normalize_date_entry(raw_entry)
                if date_info is None:
                    # A malformed entry must not discard the entries that
                    # were already accepted.
                    logging.warning(f"Invalid date info for {ticker}: {raw_entry}")
                    continue
                try:
                    is_valid = self._is_valid_period(
                        date_info.get('year', 0), date_info.get('quarter', 0)
                    )
                except (TypeError, ValueError):
                    continue
                if is_valid:
                    valid_dates.append(date_info)
                else:
                    logging.warning(f"Invalid date info for {ticker}: {date_info}")
            return valid_dates

        except Exception as e:
            logging.error(f"Error getting transcript dates for {ticker}: {e}")
            return []

    def get_transcript(self, ticker: str, year: int, quarter: int) -> Optional[Dict]:
        """Get specific earnings call transcript with fallback strategy.

        First looks for the quarter in the year's batch result; if it is not
        there, falls back to the v3 single-transcript endpoint.

        Args:
            ticker: Symbol placed in the endpoint path.
            year: Year of the call.
            quarter: Quarter of the call.

        Returns:
            The transcript dict, or ``None`` when nothing is found or an
            error occurs.
        """
        try:
            # Try batch endpoint first for better performance
            wanted_quarter = self._to_int(quarter)
            batch_transcripts = self.get_batch_transcripts(ticker, year)
            # Compare as integers: a payload carrying "3" must still match 3.
            matching_transcript = next(
                (
                    t for t in batch_transcripts
                    if wanted_quarter is not None
                    and self._to_int(t.get('quarter')) == wanted_quarter
                ),
                None
            )
            if matching_transcript:
                return matching_transcript

            # Fallback to individual transcript endpoint
            params = {
                'apikey': self.api_key,
                'year': year,
                'quarter': quarter
            }
            response = self._make_request(
                f"{self.base_url}/earning_call_transcript/{ticker}", params
            )

            if isinstance(response, list) and response:
                return response[0]
            if isinstance(response, dict) and response:
                return response

            logging.warning(f"No transcript found for {ticker} {year}Q{quarter}")
            return None

        except Exception as e:
            logging.error(f"Error getting transcript for {ticker} {year}Q{quarter}: {e}")
            return None

    def get_batch_transcripts(self, ticker: str, year: int) -> List[Dict]:
        """Get all transcripts for a specific year with validation.

        Args:
            ticker: Symbol placed in the endpoint path.
            year: Year sent as the ``year`` query parameter.

        Returns:
            Transcripts that pass :meth:`_validate_transcript`; ``[]`` when
            the response has another shape or an error occurs.
        """
        try:
            params = {'apikey': self.api_key, 'year': year}
            response = self._make_request(
                f"{self.base_url_v4}/batch_earning_call_transcript/{ticker}", params
            )

            # A lone dict is validated like list items, so a payload without
            # usable content is not passed on as a transcript.
            candidates = [response] if isinstance(response, dict) else response
            if not isinstance(candidates, list):
                return []
            return [t for t in candidates if self._validate_transcript(t)]

        except Exception as e:
            logging.error(f"Error getting batch transcripts for {ticker} {year}: {e}")
            return []

    def _validate_transcript(self, transcript: Dict) -> bool:
        """Validate transcript data.

        A transcript is valid when it has non-empty ``content`` longer than
        100 characters after stripping, and its ``year``/``quarter`` pass
        :meth:`_is_valid_period`. Anything malformed (including a non-dict
        value) yields ``False`` instead of raising.
        """
        try:
            content = transcript.get('content')
            if not content:
                return False
            if not self._is_valid_period(
                transcript.get('year', 0), transcript.get('quarter', 0)
            ):
                return False
            return len(content.strip()) > 100
        except (AttributeError, TypeError, ValueError, KeyError):
            return False

    def get_stats(self) -> Dict:
        """Get collector statistics.

        Returns:
            The live ``self.stats`` dict (not a copy).
        """
        return self.stats