""" Complete example showing how to use the OHLCVPredictor for making predictions. This example demonstrates: 1. Loading a trained model 2. Preparing sample OHLCV data 3. Making log return predictions 4. Making price predictions 5. Evaluating and displaying results """ import os import pandas as pd import numpy as np from datetime import datetime, timedelta from predictor import OHLCVPredictor def create_sample_ohlcv_data(num_samples=200): """ Create realistic sample OHLCV data for demonstration. In practice, replace this with your actual data loading. Returns: pd.DataFrame: DataFrame with OHLCV data """ print("Creating sample OHLCV data for demonstration...") # Start with a base price and simulate realistic price movements np.random.seed(42) # For reproducible results base_price = 50000.0 # Base Bitcoin price # Generate timestamps (1-minute intervals) start_time = datetime(2024, 1, 1) timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)] # Generate realistic price movements returns = np.random.normal(0, 0.001, num_samples) # Small random returns prices = [base_price] for i in range(1, num_samples): # Add some trending behavior trend = 0.0001 * np.sin(i / 50.0) # Gentle sinusoidal trend price_change = returns[i] + trend new_price = prices[-1] * (1 + price_change) prices.append(max(new_price, 1000)) # Minimum price floor # Generate OHLCV data data = [] for i in range(num_samples): price = prices[i] # Generate realistic OHLC within a reasonable range volatility = abs(np.random.normal(0, 0.002)) # Random volatility high = price * (1 + volatility) low = price * (1 - volatility) # Ensure OHLC relationships are correct open_price = price * (1 + np.random.normal(0, 0.0005)) close_price = price * (1 + np.random.normal(0, 0.0005)) # Ensure high is highest and low is lowest high = max(high, open_price, close_price) low = min(low, open_price, close_price) # Generate volume (typically higher during price movements) base_volume = 100 + abs(np.random.normal(0, 50)) volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10 volume = base_volume * volume_multiplier data.append({ 'Timestamp': timestamps[i], 'Open': round(open_price, 2), 'High': round(high, 2), 'Low': round(low, 2), 'Close': round(close_price, 2), 'Volume': round(volume, 2) }) df = pd.DataFrame(data) # Calculate log returns (required by feature engineering) df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) print(f"Generated {len(df)} samples of OHLCV data") print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}") return df def load_real_data_example(): """ Example of how to load real OHLCV data. Replace this with your actual data loading logic. Returns: pd.DataFrame or None: Real OHLCV data if available """ # Example paths where real data might be located possible_paths = [ '../data/btcusd_1-min_data.csv', '../data/sample_data.csv', 'data/crypto_data.csv' ] for path in possible_paths: if os.path.exists(path): print(f"Loading real data from {path}...") try: df = pd.read_csv(path) # Ensure required columns exist required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp'] if all(col in df.columns for col in required_cols): # Filter out zero volume entries and calculate log returns df = df[df['Volume'] != 0].reset_index(drop=True) # Use only recent data and ensure proper data types df = df.tail(500).reset_index(drop=True) # Get more data for better feature engineering df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) print(f"Successfully loaded {len(df)} rows of real data") return df.tail(200) # Use last 200 for final processing else: print(f"Missing required columns in {path}") except Exception as e: print(f"Error loading {path}: {e}") return None def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None): """ Display prediction results in a readable format. Args: df: Original OHLCV DataFrame log_return_preds: Array of log return predictions predicted_prices: Array of predicted prices (optional) actual_prices: Array of actual prices (optional) """ print("\n" + "="*60) print("PREDICTION RESULTS") print("="*60) # Convert timestamps back to readable format for display df_display = df.copy() df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s') print(f"\nLog Return Predictions (first 10):") print("-" * 40) for i in range(min(10, len(log_return_preds))): timestamp = df_display.iloc[i]['Timestamp'] close_price = df_display.iloc[i]['Close'] log_ret = log_return_preds[i] direction = "UP" if log_ret > 0 else "DOWN" print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | " f"Close: ${close_price:8.2f} | " f"Log Return: {log_ret:8.6f} | " f"Direction: {direction}") if predicted_prices is not None and actual_prices is not None: print(f"\nPrice Predictions vs Actual (first 10):") print("-" * 50) for i in range(min(10, len(predicted_prices))): timestamp = df_display.iloc[i]['Timestamp'] pred_price = predicted_prices[i] actual_price = actual_prices[i] error = abs(pred_price - actual_price) error_pct = (error / actual_price) * 100 print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | " f"Predicted: ${pred_price:8.2f} | " f"Actual: ${actual_price:8.2f} | " f"Error: {error_pct:5.2f}%") # Statistics print(f"\nPrediction Statistics:") print("-" * 30) print(f"Total predictions: {len(log_return_preds)}") print(f"Mean log return: {np.mean(log_return_preds):.6f}") print(f"Std log return: {np.std(log_return_preds):.6f}") print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)") print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)") if predicted_prices is not None and actual_prices is not None: mae = np.mean(np.abs(predicted_prices - actual_prices)) mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100 print(f"\nPrice Prediction Accuracy:") print(f"Mean Absolute Error: ${mae:.2f}") print(f"Mean Absolute Percentage Error: {mape:.2f}%") def demonstrate_batch_prediction(predictor, df): """ Demonstrate batch prediction on multiple data chunks. Args: predictor: OHLCVPredictor instance df: OHLCV DataFrame """ print("\n" + "="*60) print("BATCH PREDICTION DEMONSTRATION") print("="*60) chunk_size = 50 num_chunks = min(3, len(df) // chunk_size) for i in range(num_chunks): start_idx = i * chunk_size end_idx = start_idx + chunk_size chunk_df = df.iloc[start_idx:end_idx].copy() print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...") try: log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}') print(f"Successfully predicted {len(log_return_preds)} log returns") print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}") except Exception as e: print(f"Error in batch {i+1}: {e}") def main(): """ Main function demonstrating complete OHLCVPredictor usage. """ model_path = '../data/xgboost_model_all_features.json' # Check if model exists if not os.path.exists(model_path): print("Model not found. Run main.py first to train the model.") print(f"Expected model path: {model_path}") return try: # Load predictor print("Loading predictor...") predictor = OHLCVPredictor(model_path) print("Predictor loaded successfully!") # Try to load real data first, fall back to synthetic data df = load_real_data_example() if df is None: df = create_sample_ohlcv_data(200) print(f"\nDataFrame shape: {df.shape}") print(f"Columns: {list(df.columns)}") print(f"Data range: {len(df)} samples") # Demonstrate log return predictions print("\n" + "="*60) print("LOG RETURN PREDICTIONS") print("="*60) log_return_preds = predictor.predict(df, csv_prefix='inference_demo') print(f"Generated {len(log_return_preds)} log return predictions") # Demonstrate price predictions print("\n" + "="*60) print("PRICE PREDICTIONS") print("="*60) predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo') print(f"Generated {len(predicted_prices)} price predictions") # Display results display_prediction_results(df, log_return_preds, predicted_prices, actual_prices) # Demonstrate batch processing demonstrate_batch_prediction(predictor, df) print("\n" + "="*60) print("USAGE EXAMPLES FOR OTHER PROJECTS") print("="*60) print(""" # Basic usage: from predictor import OHLCVPredictor # Load your trained model predictor = OHLCVPredictor('path/to/your/model.json') # Prepare your OHLCV data (pandas DataFrame with columns): # ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'] # Get log return predictions log_returns = predictor.predict(your_dataframe) # Get price predictions predicted_prices, actual_prices = predictor.predict_prices(your_dataframe) # Required files for deployment: # - predictor.py # - custom_xgboost.py # - feature_engineering.py # - technical_indicator_functions.py # - your_trained_model.json """) except FileNotFoundError as e: print(f"File not found: {e}") print("Make sure the model file exists and the path is correct.") except Exception as e: print(f"Error during prediction: {e}") print("Check your data format and model compatibility.") if __name__ == '__main__': main()