# OHLCVPredictor/inference_example.py
#
# 299 lines
# 11 KiB
# Python

"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor
def create_sample_ohlcv_data(num_samples=200, seed=42):
    """
    Create synthetic OHLCV data for demonstration.

    In practice, replace this with your actual data loading.

    Args:
        num_samples: Number of one-minute bars to generate. Must be >= 0;
            0 yields an empty frame that still carries every column.
        seed: Value handed to ``np.random.seed`` before any number is drawn.
            The default (42) makes repeated calls return identical frames.
            Note that this reseeds numpy's *global* random state.

    Returns:
        pd.DataFrame: One row per bar with the columns 'Timestamp', 'Open',
        'High', 'Low', 'Close', 'Volume' and 'log_return'. 'log_return' is
        the log of each Close over the previous Close, so it is NaN on the
        first row.

    Raises:
        ValueError: If num_samples is negative.
    """
    print("Creating sample OHLCV data for demonstration...")
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")

    columns = ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

    # Start with a base price and simulate price movements.
    np.random.seed(seed)
    base_price = 50000.0

    # Timestamps at 1-minute intervals.
    start_time = datetime(2024, 1, 1)
    timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]

    # Random walk: small Gaussian returns plus a gentle sinusoidal trend.
    returns = np.random.normal(0, 0.001, num_samples)
    prices = [base_price]
    for i in range(1, num_samples):
        trend = 0.0001 * np.sin(i / 50.0)
        new_price = prices[-1] * (1 + returns[i] + trend)
        prices.append(max(new_price, 1000))  # minimum price floor

    # NOTE: the order of the np.random calls below determines the generated
    # values; keep it unchanged to reproduce the same data for a given seed.
    data = []
    for timestamp, price in zip(timestamps, prices):
        volatility = abs(np.random.normal(0, 0.002))
        open_price = price * (1 + np.random.normal(0, 0.0005))
        close_price = price * (1 + np.random.normal(0, 0.0005))

        # High must be the highest and Low the lowest value of the bar.
        high = max(price * (1 + volatility), open_price, close_price)
        low = min(price * (1 - volatility), open_price, close_price)

        # Volume grows with the size of the open-to-close move.
        base_volume = 100 + abs(np.random.normal(0, 50))
        volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
        volume = base_volume * volume_multiplier

        data.append({
            'Timestamp': timestamp,
            'Open': round(open_price, 2),
            'High': round(high, 2),
            'Low': round(low, 2),
            'Close': round(close_price, 2),
            'Volume': round(volume, 2),
        })

    # Passing the column list keeps the schema intact even when `data` is
    # empty (a bare pd.DataFrame([]) would have no 'Close' column at all).
    df = pd.DataFrame(data, columns=columns)

    # Log returns are required by feature engineering.
    if df.empty:
        df['log_return'] = pd.Series(dtype='float64')
    else:
        df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    print(f"Generated {len(df)} samples of OHLCV data")
    if not df.empty:
        print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
    return df
def load_real_data_example(paths=None):
    """
    Example of how to load real OHLCV data.

    Replace this with your actual data loading logic.

    Each candidate path is tried in order. The first existing CSV that has
    all required columns is cleaned up and returned: rows with zero Volume
    are dropped, the last 500 remaining rows are kept, 'log_return' (log of
    Close over the previous Close) is added, and the last 200 rows of that
    are returned. A file that fails to load or lacks columns is reported on
    stdout and the next candidate is tried.

    Args:
        paths: Optional iterable of CSV paths to try. When None, the
            example locations used by this project are tried.

    Returns:
        pd.DataFrame or None: Up to 200 rows with a fresh 0-based index, or
        None when no candidate could be loaded.
    """
    if paths is None:
        # Example paths where real data might be located.
        paths = [
            '../data/btcusd_1-min_data.csv',
            '../data/sample_data.csv',
            'data/crypto_data.csv',
        ]
    required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']

    for path in paths:
        if not os.path.exists(path):
            continue
        print(f"Loading real data from {path}...")
        # Best-effort by design: any failure on one file falls through to
        # the next candidate instead of aborting the demo.
        try:
            df = pd.read_csv(path)
            if any(col not in df.columns for col in required_cols):
                print(f"Missing required columns in {path}")
                continue

            # Drop zero-volume entries, then keep a window larger than the
            # one returned so the first returned row has a previous Close.
            df = df[df['Volume'] != 0]
            df = df.tail(500).reset_index(drop=True)
            df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
            print(f"Successfully loaded {len(df)} rows of real data")

            # Re-index so the result starts at 0, like the synthetic data
            # produced elsewhere in this script.
            return df.tail(200).reset_index(drop=True)
        except Exception as e:
            print(f"Error loading {path}: {e}")
    return None
def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
    """
    Display prediction results in a readable format.

    Prints up to ten log-return predictions next to the timestamp and Close
    of the row at the same position in ``df``, optionally up to ten
    predicted-vs-actual price rows, and summary statistics over all
    predictions. ``df`` itself is not modified.

    Args:
        df: Original OHLCV DataFrame; its 'Timestamp' and 'Close' columns
            are read. 'Timestamp' is passed to ``pd.to_datetime`` with
            ``unit='s'``.
        log_return_preds: Array-like of log return predictions.
        predicted_prices: Array-like of predicted prices (optional).
        actual_prices: Array-like of actual prices (optional).

    The price sections are printed only when both ``predicted_prices`` and
    ``actual_prices`` are given.
    """
    # Coerce to arrays so plain lists (or Series with a non-default index)
    # work: the statistics below rely on elementwise comparison and
    # subtraction, and the listings index by position.
    log_return_preds = np.asarray(log_return_preds)
    have_prices = predicted_prices is not None and actual_prices is not None
    if have_prices:
        predicted_prices = np.asarray(predicted_prices)
        actual_prices = np.asarray(actual_prices)

    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    # Convert timestamps to a readable format for display only.
    timestamps = pd.to_datetime(df['Timestamp'], unit='s')
    closes = df['Close']

    print(f"\nLog Return Predictions (first 10):")
    print("-" * 40)
    # Never list more rows than the frame has, even if there are more
    # predictions than rows.
    for i in range(min(10, len(log_return_preds), len(df))):
        timestamp = timestamps.iloc[i]
        close_price = closes.iloc[i]
        log_ret = log_return_preds[i]
        direction = "UP" if log_ret > 0 else "DOWN"
        print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
              f"Close: ${close_price:8.2f} | "
              f"Log Return: {log_ret:8.6f} | "
              f"Direction: {direction}")

    if have_prices:
        print(f"\nPrice Predictions vs Actual (first 10):")
        print("-" * 50)
        shown = min(10, len(predicted_prices), len(actual_prices), len(df))
        for i in range(shown):
            timestamp = timestamps.iloc[i]
            pred_price = predicted_prices[i]
            actual_price = actual_prices[i]
            error = abs(pred_price - actual_price)
            error_pct = (error / actual_price) * 100
            print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
                  f"Predicted: ${pred_price:8.2f} | "
                  f"Actual: ${actual_price:8.2f} | "
                  f"Error: {error_pct:5.2f}%")

    # Statistics over every prediction, not just the rows listed above.
    print(f"\nPrediction Statistics:")
    print("-" * 30)
    print(f"Total predictions: {len(log_return_preds)}")
    print(f"Mean log return: {np.mean(log_return_preds):.6f}")
    print(f"Std log return: {np.std(log_return_preds):.6f}")
    print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
    print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")

    if have_prices:
        mae = np.mean(np.abs(predicted_prices - actual_prices))
        mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
        print(f"\nPrice Prediction Accuracy:")
        print(f"Mean Absolute Error: ${mae:.2f}")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")
def demonstrate_batch_prediction(predictor, df):
    """
    Run the predictor over consecutive slices of the data.

    The frame is cut into slices of 50 rows; at most the first three full
    slices are processed. Each slice is copied and passed to
    ``predictor.predict`` with ``csv_prefix`` set to ``batch_<n>``. A failure
    in one batch is printed and does not stop the remaining batches.

    Args:
        predictor: OHLCVPredictor instance
        df: OHLCV DataFrame
    """
    rule = "=" * 60
    print("\n" + rule)
    print("BATCH PREDICTION DEMONSTRATION")
    print(rule)

    rows_per_batch = 50
    batch_count = min(3, len(df) // rows_per_batch)

    for batch_no in range(1, batch_count + 1):
        offset = (batch_no - 1) * rows_per_batch
        batch = df.iloc[offset:offset + rows_per_batch].copy()
        print(f"\nBatch {batch_no}: Processing {len(batch)} samples...")
        try:
            batch_preds = predictor.predict(batch, csv_prefix=f'batch_{batch_no}')
            print(f"Successfully predicted {len(batch_preds)} log returns")
            print(f"Batch {batch_no} mean prediction: {np.mean(batch_preds):.6f}")
        except Exception as exc:
            # Keep going: one bad batch should not end the demonstration.
            print(f"Error in batch {batch_no}: {exc}")
def main():
    """
    Walk through a complete OHLCVPredictor session.

    Steps: verify the model file exists, load the predictor, obtain data
    (real data if available, otherwise 200 synthetic samples), produce log
    return and price predictions, display them, run the batch demonstration
    and finally print a usage snippet. Errors raised along the way are
    reported on stdout rather than propagated.
    """
    def _section(title):
        # Banner printed in front of every stage of the demo.
        rule = "=" * 60
        print("\n" + rule)
        print(title)
        print(rule)

    model_path = '../data/xgboost_model_all_features.json'

    # Nothing can be demonstrated without a trained model.
    if not os.path.exists(model_path):
        print("Model not found. Run main.py first to train the model.")
        print(f"Expected model path: {model_path}")
        return

    try:
        print("Loading predictor...")
        predictor = OHLCVPredictor(model_path)
        print("Predictor loaded successfully!")

        # Prefer real data; fall back to synthetic data when none is found.
        df = load_real_data_example()
        if df is None:
            df = create_sample_ohlcv_data(200)

        print(f"\nDataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print(f"Data range: {len(df)} samples")

        _section("LOG RETURN PREDICTIONS")
        log_return_preds = predictor.predict(df, csv_prefix='inference_demo')
        print(f"Generated {len(log_return_preds)} log return predictions")

        _section("PRICE PREDICTIONS")
        predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo')
        print(f"Generated {len(predicted_prices)} price predictions")

        display_prediction_results(df, log_return_preds, predicted_prices, actual_prices)
        demonstrate_batch_prediction(predictor, df)

        _section("USAGE EXAMPLES FOR OTHER PROJECTS")
        # The snippet is printed verbatim, hence the unindented literal.
        usage_text = """
# Basic usage:
from predictor import OHLCVPredictor
# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')
# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
# Get log return predictions
log_returns = predictor.predict(your_dataframe)
# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)
# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
"""
        print(usage_text)
    except FileNotFoundError as missing:
        print(f"File not found: {missing}")
        print("Make sure the model file exists and the path is correct.")
    except Exception as failure:
        print(f"Error during prediction: {failure}")
        print("Check your data format and model compatibility.")
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    main()