lowkey_backtest/train_model.py

130 lines
4.7 KiB
Python

import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import strategy_config as config
def train_model():
print(f"--- Starting Model Training Pipeline ---")
try:
if not os.path.exists(config.FEATURES_PATH):
print(f"Error: {config.FEATURES_PATH} not found. Run prepare_data.py first.")
return
df = pd.read_csv(config.FEATURES_PATH)
# Ensure index if needed, but read_csv usually reads generic index unless specified
# prepare_data saved with index (timestamp)
if 'timestamp' in df.columns:
df = df.set_index('timestamp')
print(f"Loaded {len(df)} data points from {config.FEATURES_PATH}")
y = df['target']
print(f"Buy signals rate: {y.mean():.1%}")
# Use the dynamic feature list directly from config.py
# Check if all features exist
available_feats = [f for f in config.FEATURE_NAMES if f in df.columns]
missing_feats = [f for f in config.FEATURE_NAMES if f not in df.columns]
if missing_feats:
print(f"⚠️ Missing features: {missing_feats}")
print(f"Proceeding with {len(available_feats)} features.")
X = df[available_feats]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, shuffle=False
)
# Save the test set start index for the backtester to use
# This prevents train/test leakage during backtesting
test_start_idx = len(X_train)
test_start_timestamp = df.index[test_start_idx] if hasattr(df.index, '__getitem__') else test_start_idx
# Save split info
split_info = {
'test_start_idx': test_start_idx,
'test_start_timestamp': str(test_start_timestamp),
'train_size': len(X_train),
'test_size': len(X_test)
}
split_info_path = config.MODEL_PATH.replace('.pkl', '_split.pkl')
with open(split_info_path, 'wb') as f:
pickle.dump(split_info, f)
print(f"Split info saved: Test starts at index {test_start_idx} ({test_start_timestamp})")
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("\nTraining XGBoost model...")
model = xgb.XGBClassifier(
objective='binary:logistic',
eval_metric='logloss',
n_estimators=200,
learning_rate=0.05,
scale_pos_weight=8.0,
max_depth=5,
subsample=0.8,
random_state=42,
early_stopping_rounds=10
)
model.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
verbose=False
)
print("Model training complete.")
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
print(f"\n--- Model Evaluation ---")
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Hold/Sell (0)', 'Buy (1)']))
print("\n--- Probability Threshold Analysis ---")
thresholds = [0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
for thresh in thresholds:
pred_at_thresh = (y_proba >= thresh).astype(int)
if pred_at_thresh.sum() > 0:
precision, recall, f1, _ = precision_recall_fscore_support(
y_test, pred_at_thresh, average='binary', zero_division=0
)
signal_rate = pred_at_thresh.mean() * 100
print(f" Thresh {thresh:.2f}: Precision={precision:.2f}, Recall={recall:.2f}, "
f"F1={f1:.2f}, Signals={signal_rate:.1f}%")
else:
print(f" Thresh {thresh:.2f}: No signals generated")
with open(config.MODEL_PATH, 'wb') as f:
pickle.dump(model, f)
print(f"\nSUCCESS: Model saved to {config.MODEL_PATH}")
# Feature Importance
importance = pd.DataFrame({
'feature': X.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Features:")
print(importance.head(10))
except Exception as e:
print(f"AN ERROR OCCURRED: {e}")
import traceback
traceback.print_exc()
import os
if __name__ == "__main__":
train_model()