# OHLCVPredictor/inference_example.py

"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor

def create_sample_ohlcv_data(num_samples=200):
    """
    Create synthetic OHLCV data for demonstration.
    In practice, replace this with your actual data loading.

    The generator is re-seeded with a fixed value on every call, so repeated
    calls with the same ``num_samples`` produce identical frames. The seeding
    goes through ``np.random.seed`` and therefore also resets NumPy's global
    random state.

    Args:
        num_samples: Number of one-minute candles to generate. Zero yields an
            empty frame that still carries every expected column.

    Returns:
        pd.DataFrame: Columns 'Timestamp', 'Open', 'High', 'Low', 'Close',
        'Volume' and 'log_return'. The first 'log_return' is NaN because
        there is no previous close to compare against.

    Raises:
        ValueError: If num_samples is negative.
    """
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")

    print("Creating sample OHLCV data for demonstration...")

    # Start with a base price and simulate small random price movements
    np.random.seed(42)  # For reproducible results
    base_price = 50000.0  # Base Bitcoin price

    # Generate timestamps (1-minute intervals)
    start_time = datetime(2024, 1, 1)
    timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]

    # Random walk: each step is a small random return plus a slow sinusoidal drift
    returns = np.random.normal(0, 0.001, num_samples)
    prices = [base_price]
    for i in range(1, num_samples):
        trend = 0.0001 * np.sin(i / 50.0)
        new_price = prices[-1] * (1 + returns[i] + trend)
        prices.append(max(new_price, 1000))  # Minimum price floor

    # Build one candle around each reference price.
    # zip stops at the shorter sequence, so no candle is produced when
    # num_samples is 0 even though `prices` always holds the base price.
    data = []
    for timestamp, price in zip(timestamps, prices):
        # The order of the random draws below is part of the reproducible output
        volatility = abs(np.random.normal(0, 0.002))
        open_price = price * (1 + np.random.normal(0, 0.0005))
        close_price = price * (1 + np.random.normal(0, 0.0005))

        # Ensure high is the highest and low is the lowest value of the candle
        high = max(price * (1 + volatility), open_price, close_price)
        low = min(price * (1 - volatility), open_price, close_price)

        # Volume grows with the relative open/close move of the candle
        base_volume = 100 + abs(np.random.normal(0, 50))
        volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
        volume = base_volume * volume_multiplier

        data.append({
            'Timestamp': timestamp,
            'Open': round(open_price, 2),
            'High': round(high, 2),
            'Low': round(low, 2),
            'Close': round(close_price, 2),
            'Volume': round(volume, 2)
        })

    # Naming the columns explicitly keeps them present when `data` is empty
    columns = ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
    df = pd.DataFrame(data, columns=columns)
    if df.empty:
        # Give the empty frame datetime/float columns instead of generic object ones
        dtypes = {name: 'float64' for name in columns}
        dtypes['Timestamp'] = 'datetime64[ns]'
        df = df.astype(dtypes)

    # Calculate log returns (required by feature engineering)
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    print(f"Generated {len(df)} samples of OHLCV data")
    if not df.empty:
        print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
    return df

def load_real_data_example():
    """
    Example of how to load real OHLCV data.
    Replace this with your actual data loading logic.

    Candidate CSV paths are tried in order, relative to the current working
    directory. The first file that exists, parses and contains all required
    columns wins. Zero-volume rows are dropped, log returns are computed over
    the most recent 500 remaining rows, and the last 200 of those are returned.

    Returns:
        pd.DataFrame or None: A standalone frame with a fresh 0-based index
        and an added 'log_return' column, or None when no candidate file
        could be used.
    """
    # Example paths where real data might be located
    possible_paths = [
        '../data/btcusd_1-min_data.csv',
        '../data/sample_data.csv',
        'data/crypto_data.csv'
    ]
    required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']

    for path in possible_paths:
        if not os.path.exists(path):
            continue

        print(f"Loading real data from {path}...")
        try:
            df = pd.read_csv(path)

            # Ensure required columns exist, otherwise move on to the next candidate
            if not all(col in df.columns for col in required_cols):
                print(f"Missing required columns in {path}")
                continue

            # Filter out zero volume entries and keep only the most recent rows;
            # more rows than are returned are kept so the first returned row
            # already has a previous close for its log return
            df = df[df['Volume'] != 0].tail(500).reset_index(drop=True)
            df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
            print(f"Successfully loaded {len(df)} rows of real data")

            # Re-index the final window so callers receive an independent,
            # 0-based frame rather than a slice carrying the old row labels
            return df.tail(200).reset_index(drop=True)
        except Exception as e:
            # Best-effort loader: report the problem and try the next path
            print(f"Error loading {path}: {e}")

    return None

def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
    """
    Display prediction results in a readable format.

    Predictions are paired with the rows of df by position (the i-th
    prediction is shown next to the i-th row). The input frame is not
    modified.
    NOTE(review): this assumes the predictor returns one prediction per input
    row, in order - confirm against the predictor implementation.

    Args:
        df: Original OHLCV DataFrame; the 'Timestamp' and 'Close' columns are
            read. Numeric timestamps are interpreted as Unix seconds, anything
            else is parsed by pandas as date/time values.
        log_return_preds: Array-like of log return predictions
        predicted_prices: Array-like of predicted prices (optional)
        actual_prices: Array-like of actual prices (optional)
    """
    # Accept lists, Series or arrays alike: the comparisons and arithmetic
    # below rely on element-wise ndarray behavior
    log_return_preds = np.asarray(log_return_preds, dtype=float).ravel()
    has_prices = predicted_prices is not None and actual_prices is not None
    if has_prices:
        predicted_prices = np.asarray(predicted_prices, dtype=float).ravel()
        actual_prices = np.asarray(actual_prices, dtype=float).ravel()

    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    # Convert timestamps to a readable format for display; only numeric
    # columns are treated as epoch seconds so that string or datetime
    # columns do not fail the conversion
    raw_timestamps = df['Timestamp']
    if pd.api.types.is_numeric_dtype(raw_timestamps):
        timestamps = pd.to_datetime(raw_timestamps, unit='s')
    else:
        timestamps = pd.to_datetime(raw_timestamps)
    closes = df['Close']

    print(f"\nLog Return Predictions (first 10):")
    print("-" * 40)
    # zip stops at the shortest input, so a frame with fewer rows than
    # predictions shortens the preview instead of raising
    for timestamp, close_price, log_ret in zip(timestamps, closes, log_return_preds[:10]):
        direction = "UP" if log_ret > 0 else "DOWN"
        print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
              f"Close: ${close_price:8.2f} | "
              f"Log Return: {log_ret:8.6f} | "
              f"Direction: {direction}")

    if has_prices:
        print(f"\nPrice Predictions vs Actual (first 10):")
        print("-" * 50)
        for timestamp, pred_price, actual_price in zip(timestamps, predicted_prices[:10], actual_prices):
            error = abs(pred_price - actual_price)
            error_pct = (error / actual_price) * 100
            print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
                  f"Predicted: ${pred_price:8.2f} | "
                  f"Actual: ${actual_price:8.2f} | "
                  f"Error: {error_pct:5.2f}%")

    # Statistics
    positive = log_return_preds > 0
    negative = log_return_preds < 0
    print(f"\nPrediction Statistics:")
    print("-" * 30)
    print(f"Total predictions: {len(log_return_preds)}")
    print(f"Mean log return: {np.mean(log_return_preds):.6f}")
    print(f"Std log return: {np.std(log_return_preds):.6f}")
    print(f"Positive predictions: {np.sum(positive)} ({np.mean(positive)*100:.1f}%)")
    print(f"Negative predictions: {np.sum(negative)} ({np.mean(negative)*100:.1f}%)")

    if has_prices:
        price_errors = np.abs(predicted_prices - actual_prices)
        mae = np.mean(price_errors)
        mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
        print(f"\nPrice Prediction Accuracy:")
        print(f"Mean Absolute Error: ${mae:.2f}")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")

def demonstrate_batch_prediction(predictor, df):
    """
    Run the predictor over consecutive fixed-size slices of the data.

    At most three batches of 50 rows each are taken from the start of df;
    a trailing remainder shorter than 50 rows is not processed. A failure in
    one batch is reported and does not stop the remaining batches.

    Args:
        predictor: OHLCVPredictor instance
        df: OHLCV DataFrame
    """
    separator = "=" * 60
    print("\n" + separator)
    print("BATCH PREDICTION DEMONSTRATION")
    print(separator)

    chunk_size = 50
    batch_count = min(3, len(df) // chunk_size)

    for batch_no in range(1, batch_count + 1):
        first_row = (batch_no - 1) * chunk_size
        # Work on a copy so the predictor cannot alter the caller's frame
        window = df.iloc[first_row:first_row + chunk_size].copy()

        print(f"\nBatch {batch_no}: Processing {len(window)} samples...")

        try:
            batch_preds = predictor.predict(window, csv_prefix=f'batch_{batch_no}')
            print(f"Successfully predicted {len(batch_preds)} log returns")
            print(f"Batch {batch_no} mean prediction: {np.mean(batch_preds):.6f}")

        except Exception as err:
            print(f"Error in batch {batch_no}: {err}")

def main():
    """
    Run the complete OHLCVPredictor demonstration.

    Steps: verify the model file exists, load the predictor, obtain data
    (real data when available, synthetic otherwise), produce log return and
    price predictions, display them, run the batch demonstration and print
    a usage summary. Errors are reported on stdout instead of propagating.
    """
    def banner(title):
        # Section header: blank line, rule, title, rule
        print("\n" + "="*60)
        print(title)
        print("="*60)

    model_path = '../data/xgboost_model_all_features.json'

    # Nothing can be demonstrated without a trained model
    if not os.path.exists(model_path):
        print("Model not found. Run main.py first to train the model.")
        print(f"Expected model path: {model_path}")
        return

    try:
        print("Loading predictor...")
        predictor = OHLCVPredictor(model_path)
        print("Predictor loaded successfully!")

        # Prefer real data; fall back to synthetic data when none is found
        ohlcv = load_real_data_example()
        if ohlcv is None:
            ohlcv = create_sample_ohlcv_data(200)

        print(f"\nDataFrame shape: {ohlcv.shape}")
        print(f"Columns: {list(ohlcv.columns)}")
        print(f"Data range: {len(ohlcv)} samples")

        # Log return predictions
        banner("LOG RETURN PREDICTIONS")
        log_return_preds = predictor.predict(ohlcv, csv_prefix='inference_demo')
        print(f"Generated {len(log_return_preds)} log return predictions")

        # Price predictions
        banner("PRICE PREDICTIONS")
        predicted_prices, actual_prices = predictor.predict_prices(ohlcv, csv_prefix='price_demo')
        print(f"Generated {len(predicted_prices)} price predictions")

        # Show everything, then run the batch walkthrough
        display_prediction_results(ohlcv, log_return_preds, predicted_prices, actual_prices)
        demonstrate_batch_prediction(predictor, ohlcv)

        banner("USAGE EXAMPLES FOR OTHER PROJECTS")
        usage_text = """
                # Basic usage:
                from predictor import OHLCVPredictor

                # Load your trained model
                predictor = OHLCVPredictor('path/to/your/model.json')

                # Prepare your OHLCV data (pandas DataFrame with columns):
                # ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

                # Get log return predictions
                log_returns = predictor.predict(your_dataframe)

                # Get price predictions
                predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)

                # Required files for deployment:
                # - predictor.py
                # - custom_xgboost.py
                # - feature_engineering.py
                # - technical_indicator_functions.py
                # - your_trained_model.json
        """
        print(usage_text)

    except FileNotFoundError as missing:
        print(f"File not found: {missing}")
        print("Make sure the model file exists and the path is correct.")

    except Exception as failure:
        print(f"Error during prediction: {failure}")
        print("Check your data format and model compatibility.")

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module
if __name__ == '__main__':
    main()