299 lines
11 KiB
Python
299 lines
11 KiB
Python
"""
Complete example showing how to use the OHLCVPredictor for making predictions.

This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""
|
|
|
|
import os
|
|
import pandas as pd
|
|
import numpy as np
|
|
from datetime import datetime, timedelta
|
|
from predictor import OHLCVPredictor
|
|
|
|
def create_sample_ohlcv_data(num_samples=200):
    """
    Create realistic sample OHLCV data for demonstration.

    In practice, replace this with your actual data loading.

    A reference price starts at 50000 and follows a random walk with a gentle
    sinusoidal drift. Each bar gets an open and close jittered around that
    price, a high/low envelope that always contains both, and a volume that
    grows with the open-to-close move. The generator is seeded, so the same
    ``num_samples`` always produces the same frame.

    Args:
        num_samples: Number of one-minute bars to generate; must be >= 1.

    Returns:
        pd.DataFrame: DataFrame with OHLCV data in the columns ``Timestamp``,
        ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and ``log_return``.
        The first ``log_return`` is NaN because it has no previous close.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    print("Creating sample OHLCV data for demonstration...")

    # A private generator seeded with 42 produces the same stream that
    # np.random.seed(42) would, but leaves the caller's global RNG untouched.
    rng = np.random.RandomState(42)
    base_price = 50000.0  # Base Bitcoin price

    # Generate timestamps (1-minute intervals)
    start_time = datetime(2024, 1, 1)
    timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]

    # Random walk for the reference price of every bar
    returns = rng.normal(0, 0.001, num_samples)  # Small random returns
    prices = [base_price]
    for i in range(1, num_samples):
        trend = 0.0001 * np.sin(i / 50.0)  # Gentle sinusoidal trend
        price_change = returns[i] + trend
        new_price = prices[-1] * (1 + price_change)
        prices.append(max(new_price, 1000))  # Minimum price floor

    # Build one OHLCV row per bar. The draw order (volatility, open, close,
    # volume) is kept fixed so the generated values stay reproducible.
    data = []
    for timestamp, price in zip(timestamps, prices):
        volatility = abs(rng.normal(0, 0.002))  # Random volatility
        open_price = price * (1 + rng.normal(0, 0.0005))
        close_price = price * (1 + rng.normal(0, 0.0005))

        # Ensure high is highest and low is lowest
        high = max(price * (1 + volatility), open_price, close_price)
        low = min(price * (1 - volatility), open_price, close_price)

        # Volume is scaled up when the bar moved more between open and close
        base_volume = 100 + abs(rng.normal(0, 50))
        volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
        volume = base_volume * volume_multiplier

        data.append({
            'Timestamp': timestamp,
            'Open': round(open_price, 2),
            'High': round(high, 2),
            'Low': round(low, 2),
            'Close': round(close_price, 2),
            'Volume': round(volume, 2),
        })

    df = pd.DataFrame(data)

    # Calculate log returns (required by feature engineering)
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    print(f"Generated {len(df)} samples of OHLCV data")
    print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
    return df
def load_real_data_example(possible_paths=None):
    """
    Example of how to load real OHLCV data.

    Replace this with your actual data loading logic.

    Candidate CSV files are tried in order. The first one that exists, parses
    and contains every required column is used: zero-volume rows are dropped,
    ``log_return`` is computed over the most recent 500 remaining rows, and
    the last 200 of those are returned. A candidate that is missing, lacks a
    required column or fails to load is reported and skipped.

    Args:
        possible_paths: Optional iterable of CSV paths to try, in order.
            Defaults to the example locations used by this demo.

    Returns:
        pd.DataFrame or None: Real OHLCV data if available (at most 200 rows,
        indexed from 0), otherwise None.
    """
    if possible_paths is None:
        # Example paths where real data might be located
        possible_paths = [
            '../data/btcusd_1-min_data.csv',
            '../data/sample_data.csv',
            'data/crypto_data.csv',
        ]

    required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']

    for path in possible_paths:
        if not os.path.exists(path):
            continue

        print(f"Loading real data from {path}...")
        try:
            df = pd.read_csv(path)

            # Ensure required columns exist
            if not all(col in df.columns for col in required_cols):
                print(f"Missing required columns in {path}")
                continue

            # Filter out zero volume entries, then keep only recent data.
            # 500 rows are kept first so log returns are computed on a longer
            # history than the 200 rows finally returned.
            df = df[df['Volume'] != 0]
            df = df.tail(500).reset_index(drop=True)
            df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
            print(f"Successfully loaded {len(df)} rows of real data")

            # Re-index so the result starts at 0, like the synthetic data.
            return df.tail(200).reset_index(drop=True)
        except Exception as e:
            # Deliberate best-effort: a broken file must not stop the demo,
            # so report it and fall through to the next candidate.
            print(f"Error loading {path}: {e}")

    return None
def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
    """
    Display prediction results in a readable format.

    Prints a preview of up to ten log return predictions, an optional preview
    of predicted versus actual prices, and summary statistics. Nothing is
    returned and ``df`` is not modified.

    Prediction ``i`` is shown next to row ``i`` of ``df``.
    NOTE(review): this assumes the predictions line up with the first rows of
    ``df`` -- confirm against what the predictor actually returns.

    Args:
        df: Original OHLCV DataFrame; its ``Timestamp`` and ``Close`` columns
            are read. Numeric timestamps are interpreted as epoch seconds,
            anything else is parsed by ``pd.to_datetime``.
        log_return_preds: Array-like of log return predictions
        predicted_prices: Array-like of predicted prices (optional)
        actual_prices: Array-like of actual prices (optional)
    """
    # Work on flat float arrays so lists, Series with arbitrary indexes and
    # column vectors all behave the same under positional indexing.
    log_return_preds = np.asarray(log_return_preds, dtype=float).ravel()
    have_prices = predicted_prices is not None and actual_prices is not None
    if have_prices:
        predicted_prices = np.asarray(predicted_prices, dtype=float).ravel()
        actual_prices = np.asarray(actual_prices, dtype=float).ravel()

    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    # Convert timestamps to a readable format for display. Only numeric
    # columns are epoch seconds; strings and datetimes are parsed as they are.
    timestamps = df['Timestamp']
    if pd.api.types.is_numeric_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps, unit='s')
    else:
        timestamps = pd.to_datetime(timestamps)
    closes = df['Close']

    print(f"\nLog Return Predictions (first 10):")
    print("-" * 40)
    # Bound by len(df) too, so a longer prediction array cannot index past it.
    for i in range(min(10, len(log_return_preds), len(df))):
        timestamp = timestamps.iloc[i]
        close_price = closes.iloc[i]
        log_ret = log_return_preds[i]
        direction = "UP" if log_ret > 0 else "DOWN"
        print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
              f"Close: ${close_price:8.2f} | "
              f"Log Return: {log_ret:8.6f} | "
              f"Direction: {direction}")

    if have_prices:
        print(f"\nPrice Predictions vs Actual (first 10):")
        print("-" * 50)
        preview = min(10, len(predicted_prices), len(actual_prices), len(df))
        for i in range(preview):
            timestamp = timestamps.iloc[i]
            pred_price = predicted_prices[i]
            actual_price = actual_prices[i]
            error = abs(pred_price - actual_price)
            error_pct = (error / actual_price) * 100
            print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
                  f"Predicted: ${pred_price:8.2f} | "
                  f"Actual: ${actual_price:8.2f} | "
                  f"Error: {error_pct:5.2f}%")

    # Statistics
    print(f"\nPrediction Statistics:")
    print("-" * 30)
    print(f"Total predictions: {len(log_return_preds)}")
    if len(log_return_preds) > 0:
        # Skipped for empty input, where mean/std are undefined.
        print(f"Mean log return: {np.mean(log_return_preds):.6f}")
        print(f"Std log return: {np.std(log_return_preds):.6f}")
        print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
        print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")

    if have_prices and len(predicted_prices) > 0:
        mae = np.mean(np.abs(predicted_prices - actual_prices))
        mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
        print(f"\nPrice Prediction Accuracy:")
        print(f"Mean Absolute Error: ${mae:.2f}")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")
def demonstrate_batch_prediction(predictor, df, chunk_size=50, max_chunks=3):
    """
    Demonstrate batch prediction on multiple data chunks.

    ``df`` is cut into consecutive, non-overlapping chunks of ``chunk_size``
    rows starting at the top. At most ``max_chunks`` full chunks are sent to
    ``predictor.predict``; a trailing partial chunk is never used. A batch
    that raises is reported and the remaining batches still run.

    Args:
        predictor: OHLCVPredictor instance (anything exposing
            ``predict(df, csv_prefix=...)``)
        df: OHLCV DataFrame
        chunk_size: Rows per batch; must be >= 1.
        max_chunks: Upper bound on the number of batches processed.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    print("\n" + "="*60)
    print("BATCH PREDICTION DEMONSTRATION")
    print("="*60)

    num_chunks = min(max_chunks, len(df) // chunk_size)
    if num_chunks < 1:
        # Say so explicitly instead of printing a header followed by nothing.
        print(f"\nNot enough data for a full batch: "
              f"need at least {chunk_size} rows, got {len(df)}")
        return

    for batch_no in range(1, num_chunks + 1):
        start_idx = (batch_no - 1) * chunk_size
        # Copy so the predictor cannot modify the caller's frame via a slice.
        chunk_df = df.iloc[start_idx:start_idx + chunk_size].copy()

        print(f"\nBatch {batch_no}: Processing {len(chunk_df)} samples...")

        try:
            log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{batch_no}')
            print(f"Successfully predicted {len(log_return_preds)} log returns")
            print(f"Batch {batch_no} mean prediction: {np.mean(log_return_preds):.6f}")
        except Exception as e:
            # Deliberate best-effort: one failing batch must not abort the rest.
            print(f"Error in batch {batch_no}: {e}")
def main():
    """
    Main function demonstrating complete OHLCVPredictor usage.

    Bails out early with a hint when the trained model file is missing.
    Otherwise it loads the predictor, picks real data when available (falling
    back to synthetic data), runs log return and price predictions, prints the
    results, runs the batch demonstration and finally prints a usage snippet.
    Any failure along the way is reported on stdout instead of being raised.
    """
    model_path = '../data/xgboost_model_all_features.json'
    rule = "="*60

    def announce(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    # Check if model exists
    if not os.path.exists(model_path):
        print("Model not found. Run main.py first to train the model.")
        print(f"Expected model path: {model_path}")
        return

    try:
        # Load predictor
        print("Loading predictor...")
        predictor = OHLCVPredictor(model_path)
        print("Predictor loaded successfully!")

        # Try to load real data first, fall back to synthetic data
        ohlcv = load_real_data_example()
        if ohlcv is None:
            ohlcv = create_sample_ohlcv_data(200)

        print(f"\nDataFrame shape: {ohlcv.shape}")
        print(f"Columns: {list(ohlcv.columns)}")
        print(f"Data range: {len(ohlcv)} samples")

        # Demonstrate log return predictions
        announce("LOG RETURN PREDICTIONS")
        log_return_preds = predictor.predict(ohlcv, csv_prefix='inference_demo')
        print(f"Generated {len(log_return_preds)} log return predictions")

        # Demonstrate price predictions
        announce("PRICE PREDICTIONS")
        predicted_prices, actual_prices = predictor.predict_prices(ohlcv, csv_prefix='price_demo')
        print(f"Generated {len(predicted_prices)} price predictions")

        # Display results
        display_prediction_results(ohlcv, log_return_preds, predicted_prices, actual_prices)

        # Demonstrate batch processing
        demonstrate_batch_prediction(predictor, ohlcv)

        announce("USAGE EXAMPLES FOR OTHER PROJECTS")
        print("""
# Basic usage:
from predictor import OHLCVPredictor

# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')

# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

# Get log return predictions
log_returns = predictor.predict(your_dataframe)

# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)

# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
""")

    except FileNotFoundError as exc:
        print(f"File not found: {exc}")
        print("Make sure the model file exists and the path is correct.")

    except Exception as exc:
        print(f"Error during prediction: {exc}")
        print("Check your data format and model compatibility.")
# Run the demonstration only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()