From cea43c4d3267cbaccb4ad3e543a441b26c1c57cd Mon Sep 17 00:00:00 2001 From: "Bobby (aider)" Date: Sat, 8 Feb 2025 20:03:59 -0800 Subject: [PATCH] feat: Add time interval resampling to stock data fetching --- src/utils/data_utils.py | 104 +++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 59 deletions(-) diff --git a/src/utils/data_utils.py b/src/utils/data_utils.py index 9e7d829..7c0c9cf 100644 --- a/src/utils/data_utils.py +++ b/src/utils/data_utils.py @@ -61,30 +61,24 @@ def save_signals_to_csv(signals: list, scanner_name: str) -> None: def get_stock_data(ticker: str, start_date: datetime, end_date: datetime, interval: str) -> pd.DataFrame: """ - Fetch stock data from the database with enhanced fallback logic + Fetch and resample stock data based on the chosen interval Args: ticker (str): Stock ticker symbol start_date (datetime): Start date for data fetch end_date (datetime): End date for data fetch - interval (str): Time interval for data ('daily', '5min', etc.) + interval (str): Time interval for data ('daily', '5min', '15min', '30min', '1hour') Returns: - pd.DataFrame: DataFrame with OHLCV data + pd.DataFrame: Resampled DataFrame with OHLCV data """ try: client = create_client() - # Expand window to 90 days for more data robustness + # Expand window to get enough data for calculations start_date = start_date - timedelta(days=90) - # First try primary data source - if interval == "daily": - table = "stock_prices_daily" - else: - table = "stock_prices" - - # Unified query format + # Base query to get raw data at finest granularity query = f""" SELECT toDateTime(window_start/1000000000) as date, @@ -104,50 +98,10 @@ def get_stock_data(ticker: str, start_date: datetime, end_date: datetime, interv result = client.query(query) - # Fallback to intraday data if needed - if not result.result_rows and interval == "daily": - print(f"⚠️ No daily data for {ticker}, resampling from intraday data") - intraday_query = f""" - SELECT - toDateTime(window_start/1000000000) as date, - first_value(open) AS open, - max(high) AS high, - min(low) AS low, - last_value(close) AS close, - sum(volume) AS volume - FROM stock_db.stock_prices - WHERE ticker = '{ticker}' - AND window_start BETWEEN - {int(start_date.timestamp() * 1e9)} AND - {int(end_date.timestamp() * 1e9)} - AND toYear(toDateTime(window_start/1000000000)) <= toYear(now()) - AND toYear(toDateTime(window_start/1000000000)) >= (toYear(now()) - 1) - GROUP BY date - ORDER BY date ASC - """ - result = client.query(intraday_query) - - # Fallback to different intervals if still empty - if not result.result_rows: - print(f"⚠️ No {interval} data for {ticker}, trying weekly") - weekly_query = f""" - SELECT - toStartOfWeek(window_start) AS date, - first_value(open) AS open, - max(high) AS high, - min(low) AS low, - last_value(close) AS close, - sum(volume) AS volume - FROM stock_db.stock_prices - WHERE ticker = '{ticker}' - GROUP BY date - ORDER BY date ASC - """ - result = client.query(weekly_query) - if not result.result_rows: return pd.DataFrame() + # Create base DataFrame df = pd.DataFrame( result.result_rows, columns=['date', 'open', 'high', 'low', 'close', 'volume'] @@ -158,18 +112,50 @@ def get_stock_data(ticker: str, start_date: datetime, end_date: datetime, interv for col in numeric_columns: df[col] = pd.to_numeric(df[col], errors='coerce') + # Convert date column + df['date'] = pd.to_datetime(df['date']) + + # Set date as index for resampling + df.set_index('date', inplace=True) + + # Resample based on interval + if interval == 'daily': + rule = '1D' + elif interval == '5min': + rule = '5T' + elif interval == '15min': + rule = '15T' + elif interval == '30min': + rule = '30T' + elif interval == '1hour': + rule = '1H' + else: + rule = '1D' # Default to daily + + resampled = df.resample(rule).agg({ + 'open': 'first', + 'high': 'max', + 'low': 'min', + 'close': 'last', + 'volume': 'sum' + }).dropna() + + # Reset index to get date as column + resampled.reset_index(inplace=True) + + # Filter to requested date range + mask = (resampled['date'] >= start_date + timedelta(days=89)) & (resampled['date'] <= end_date) + resampled = resampled.loc[mask] + # Handle null values - if df['close'].isnull().any(): + if resampled['close'].isnull().any(): print(f"Warning: Found null values in close prices") - df = df.dropna(subset=['close']) + resampled = resampled.dropna(subset=['close']) - if df.empty or 'close' not in df.columns: + if resampled.empty or 'close' not in resampled.columns: return pd.DataFrame() - if df['date'].dtype == object: - df['date'] = pd.to_datetime(df['date']) - - return df + return resampled except Exception as e: print(f"Error fetching {ticker} data: {str(e)}")