Refactor main script and introduce CLI for OHLCV Predictor. Consolidate functionality into a new package structure, enhancing modularity. Update README to reflect new features and usage instructions, including the requirement for a companion feature list JSON. Add configuration classes for better parameter management and streamline data loading and preprocessing.

Simon Moisy 2025-08-12 16:06:05 +08:00
parent 70da858aac
commit 289d11b0a8
14 changed files with 4361 additions and 327 deletions
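For orientation, a minimal sketch of reading back the companion feature list JSON that training now writes next to the model (paths are the defaults used in this commit):

import json

# Written by the pipeline next to the model: same basename, "_features.json" suffix.
features_path = "../data/xgboost_model_all_features_features.json"
with open(features_path) as f:
    feature_names = json.load(f)["feature_names"]  # exact feature names and order used for training
print(len(feature_names), "features expected by the model")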

.vscode/launch.json (vendored, new file, 23 lines)

@@ -0,0 +1,23 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: main.py",
"type": "debugpy",
"request": "launch",
"program": "main.py",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"--csv",
"../data/btcusd_1-min_data.csv",
"--min-date",
"2017-06-01"
]
}
]
}

README.md (modified)

@@ -51,6 +51,7 @@ The training entrypoint is `main.py`:
Outputs produced by training:
- Model: `../data/xgboost_model_all_features.json`
- Feature list: `../data/xgboost_model_all_features_features.json` (exact feature names and order used for training)
- Results CSV: `../data/cumulative_feature_results.csv`
- Charts: files under `charts/` (e.g., `all_features_prediction_error_distribution.html`)
@@ -90,6 +91,7 @@ Files needed to embed the predictor in another project:
- `feature_engineering.py`
- `technical_indicator_functions.py`
- your trained model file (e.g., `xgboost_model_all_features.json`)
- the companion feature list JSON saved next to the model (same basename with `_features.json`)
## GPU/CPU notes
@@ -114,6 +116,7 @@ uv sync
- KeyError: `'log_return'` during inference: ensure your input DataFrame includes `log_return` as described above.
- Model file not found: confirm the path passed to `OHLCVPredictor(...)` matches where training saved it (default `../data/xgboost_model_all_features.json`).
- Feature mismatch (e.g., XGBoost "Number of columns does not match"): ensure you use the model together with its companion feature list JSON. The predictor will automatically use it if present. If missing, retrain with the current code so the feature list is generated.
- Empty/old charts: delete the `charts/` folder and rerun training.
- Memory issues: consider downcasting or using smaller windows; the code already downcasts numerics where possible.
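
To make the embedding notes above concrete, here is a hedged sketch of inference from another project; the module name and exact call pattern are assumptions based on the README, not verbatim from this diff, and the predictor is expected to run the packaged feature engineering itself, so the input only needs raw OHLCV plus log_return:

import numpy as np
import pandas as pd
from ohlcv_predictor import OHLCVPredictor  # module name assumed for illustration

df = pd.read_csv("my_ohlcv.csv")  # hypothetical minute data with Timestamp and Open/High/Low/Close/Volume
df["log_return"] = np.log(df["Close"] / df["Close"].shift(1))  # required, see troubleshooting above

predictor = OHLCVPredictor("../data/xgboost_model_all_features.json")
# If the companion *_features.json sits next to the model, the predictor uses the
# exact training feature names and order automatically.
preds = predictor.predict(df)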

File diff suppressed because one or more lines are too long

main.py (329 lines changed)

@@ -1,330 +1,7 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from evaluation import walk_forward_cv
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold
from ohlcvpredictor.cli import main
charts_dir = 'charts'
if not os.path.exists(charts_dir):
os.makedirs(charts_dir)
def run_indicator(func, *args):
return func(*args)
if __name__ == "__main__":
main()
def run_indicator_job(job):
import time
func, *args = job
indicator_name = func.__name__
start = time.time()
result = func(*args)
elapsed = time.time() - start
print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
return result
if __name__ == '__main__':
IMPUTE_NANS = True # Set to True to impute NaNs, False to drop rows with NaNs
csv_path = '../data/btcusd_1-min_data.csv'
csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
print('Reading CSV and filtering data...')
df = pd.read_csv(csv_path)
df = df[df['Volume'] != 0]
min_date = '2017-06-01'
print('Converting Timestamp and filtering by date...')
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df = df[df['Timestamp'] >= min_date]
lags = 3
print('Calculating log returns as the new target...')
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
window_sizes = [5, 15, 30] # in minutes, adjust as needed
features_dict = {}
print('Starting feature computation...')
feature_start_time = time.time()
features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)
print('Concatenating all new features to DataFrame...')
features_df = pd.DataFrame(features_dict)
df = pd.concat([df, features_df], axis=1)
# feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
# if feature_cols_for_variance:
# selector = VarianceThreshold(threshold=1e-5)
# filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
# kept_mask = selector.get_support()
# kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
# print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
# # Only keep the selected features in features_df and df
# features_df = features_df[kept_feature_names]
# for col in feature_cols_for_variance:
# if col not in kept_feature_names:
# df.drop(col, axis=1, inplace=True)
# else:
# print("No numeric features found for variance thresholding.")
# Remove highly correlated features (keep only one from each correlated group)
# corr_matrix = features_df.corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# if to_drop:
# print(f"Features removed due to high correlation: {to_drop}")
# features_df = features_df.drop(columns=to_drop)
# df = df.drop(columns=to_drop)
# else:
# print("No highly correlated features found for removal.")
print('Downcasting float columns to save memory...')
for col in df.columns:
try:
df[col] = pd.to_numeric(df[col], downcast='float')
except Exception:
pass
# Add time features (exclude 'dayofweek')
print('Adding hour feature...')
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
df['hour'] = df['Timestamp'].dt.hour
# Handle NaNs after all feature engineering
if IMPUTE_NANS:
print('Imputing NaNs after feature engineering (using mean imputation)...')
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
df[col] = df[col].fillna(df[col].mean())
# If you want to impute non-numeric columns differently, add logic here
else:
print('Dropping NaNs after feature engineering...')
df = df.dropna().reset_index(drop=True)
# Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
print('Selecting feature columns...')
exclude_cols = ['Timestamp', 'Close']
exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
feature_cols = [col for col in df.columns if col not in exclude_cols]
print('Features used for training:', feature_cols)
# from xgboost import XGBRegressor
# from sklearn.model_selection import GridSearchCV
# # Prepare data for grid search
# X = df[feature_cols].values.astype(np.float32)
# y = df["log_return"].values.astype(np.float32)
# split_idx = int(len(X) * 0.8)
# X_train, X_test = X[:split_idx], X[split_idx:]
# y_train, y_test = y[:split_idx], y[split_idx:]
# # Define parameter grid
# param_grid = {
# 'learning_rate': [0.01, 0.05, 0.1],
# 'max_depth': [3, 5, 7],
# 'n_estimators': [100, 200],
# 'subsample': [0.8, 1.0],
# 'colsample_bytree': [0.8, 1.0],
# }
# print('Starting grid search for XGBoost hyperparameters...')
# xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
# grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# print('Best parameters found:', grid_search.best_params_)
# # Use best estimator for predictions
# best_model = grid_search.best_estimator_
# test_preds = best_model.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, test_preds))
# # Reconstruct price series from log returns
# if 'Close' in df.columns:
# close_prices = df['Close'].values
# else:
# close_prices = pd.read_csv(csv_path)['Close'].values
# start_price = close_prices[split_idx]
# actual_prices = [start_price]
# for r_ in y_test:
# actual_prices.append(actual_prices[-1] * np.exp(r_))
# actual_prices = np.array(actual_prices[1:])
# predicted_prices = [start_price]
# for r_ in test_preds:
# predicted_prices.append(predicted_prices[-1] * np.exp(r_))
# predicted_prices = np.array(predicted_prices[1:])
# mae = mean_absolute_error(actual_prices, predicted_prices)
# r2 = r2_score(actual_prices, predicted_prices)
# direction_actual = np.sign(np.diff(actual_prices))
# direction_pred = np.sign(np.diff(predicted_prices))
# directional_accuracy = (direction_actual == direction_pred).mean()
# mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
# print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
# plot_prefix = f'all_features_gridsearch'
# plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
# sys.exit(0)
# Prepare CSV for results
results_csv = '../data/cumulative_feature_results.csv'
if not os.path.exists(results_csv):
with open(results_csv, 'w', newline='') as f:
writer = csv.writer(f)
# Header aligned with what we actually write: feature, rmse, mape, r2, directional_accuracy, feature_importance
writer.writerow(['feature', 'rmse', 'mape', 'r2', 'directional_accuracy', 'feature_importance'])
try:
X = df[feature_cols].values.astype(np.float32)
y = df["log_return"].values.astype(np.float32)
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
test_timestamps = df['Timestamp'].values[split_idx:]
# Optional: walk-forward CV to obtain averaged importance for pruning
DO_WALK_FORWARD_CV = True
AUTO_PRUNE = True
TOP_K = 150 # keep top-K features by averaged importance
feature_importance_avg = None
if DO_WALK_FORWARD_CV:
print('Running walk-forward CV for metrics and averaged feature importance...')
metrics_avg, importance_avg = walk_forward_cv(X, y, feature_cols, n_splits=5)
print(f"CV metrics: RMSE={metrics_avg['rmse']:.6f}, MAPE={metrics_avg['mape']:.4f}%, R2={metrics_avg['r2']:.6f}, DirAcc={metrics_avg['dir_acc']:.4f}")
feature_importance_avg = importance_avg
# Auto-prune low-importance and known low-value features
prune_set = set()
# Known low-value from analysis
KNOWN_LOW = [
'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0',
'supertrend_trend_12_3.0', 'supertrend_trend_10_1.0', 'supertrend_trend_11_2.0',
]
# Prefer cyclical hour; drop raw hour
KNOWN_LOW += ['hour']
if feature_importance_avg is not None:
# Keep top-K by averaged importance
sorted_feats = sorted(feature_importance_avg.items(), key=lambda kv: kv[1], reverse=True)
keep_names = set(name for name, _ in sorted_feats[:TOP_K])
for name in feature_cols:
if name not in keep_names:
prune_set.add(name)
for name in KNOWN_LOW:
if name in feature_cols:
prune_set.add(name)
# Drop alternative vol estimators if Parkinson present at same window
for w in [5, 15, 30]:
park = f'park_vol_{w}'
gk = f'gk_vol_{w}'
rs = f'rs_vol_{w}'
yz = f'yz_vol_{w}'
if park in feature_cols:
for alt in [gk, rs, yz]:
if alt in feature_cols:
prune_set.add(alt)
# Apply pruning to training set
if AUTO_PRUNE and prune_set:
print(f'Pruning {len(prune_set)} features before training...')
kept_feature_cols = [c for c in feature_cols if c not in prune_set]
else:
kept_feature_cols = feature_cols
model = CustomXGBoostGPU(
df[kept_feature_cols].values.astype(np.float32)[:split_idx],
df[kept_feature_cols].values.astype(np.float32)[split_idx:],
y[:split_idx],
y[split_idx:]
)
booster = model.train(eval_metric='rmse')
# colsample_bytree=1.0,
# learning_rate=0.05,
# max_depth=7,
# n_estimators=200,
# subsample=0.8
# )
model.save_model(f'../data/xgboost_model_all_features.json')
X_test_kept = df[kept_feature_cols].values.astype(np.float32)[split_idx:]
test_preds = model.predict(X_test_kept)
rmse = np.sqrt(mean_squared_error(y_test, test_preds))
# Reconstruct price series from log returns
if 'Close' in df.columns:
close_prices = df['Close'].values
else:
close_prices = pd.read_csv(csv_path)['Close'].values
start_price = close_prices[split_idx]
actual_prices = [start_price]
for r_ in y_test:
actual_prices.append(actual_prices[-1] * np.exp(r_))
actual_prices = np.array(actual_prices[1:])
predicted_prices = [start_price]
for r_ in test_preds:
predicted_prices.append(predicted_prices[-1] * np.exp(r_))
predicted_prices = np.array(predicted_prices[1:])
# mae = mean_absolute_error(actual_prices, predicted_prices)
r2 = r2_score(actual_prices, predicted_prices)
direction_actual = np.sign(np.diff(actual_prices))
direction_pred = np.sign(np.diff(predicted_prices))
directional_accuracy = (direction_actual == direction_pred).mean()
mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
# Save results to CSV for all features used in this run
feature_importance_dict = model.get_feature_importance(kept_feature_cols)
with open(results_csv, 'a', newline='') as f:
writer = csv.writer(f)
for feature in kept_feature_cols:
importance = feature_importance_dict.get(feature, 0.0)
fi_str = format(importance, ".6f")
row = [feature]
for val in [rmse, mape, r2, directional_accuracy]:
if isinstance(val, float):
row.append(format(val, '.10f'))
else:
row.append(val)
row.append(fi_str)
writer.writerow(row)
print('Feature importances and results saved for all features used in this run.')
# Plotting for this run
# plot_prefix = f'cumulative_{n}_features'
# plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
# plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
except Exception as e:
print(f'Cumulative feature run failed: {e}')
print(f'All cumulative feature runs completed. Results saved to {results_csv}')
if 'predicted_prices' in locals() and 'actual_prices' in locals():
plot_prefix = f'all_features'
plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
sys.exit(0)

ohlcvpredictor/__init__.py (new file, 14 lines)

@@ -0,0 +1,14 @@
"""OHLCV Predictor package."""
__all__ = [
"config",
"data",
"preprocess",
"selection",
"metrics",
"model",
"pipeline",
"cli",
]

ohlcvpredictor/cli.py (new file, 29 lines)

@@ -0,0 +1,29 @@
import argparse
from .config import RunConfig, DataConfig
from .pipeline import run_pipeline
def build_arg_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(description="OHLCV Predictor Pipeline")
p.add_argument("--csv", dest="csv_path", required=False, default="../data/btcusd_1-min_data.csv")
p.add_argument("--min-date", dest="min_date", required=False, default="2017-06-01")
p.add_argument("--max-date", dest="max_date", required=False, default=None)
return p
def main() -> None:
parser = build_arg_parser()
args = parser.parse_args()
run_cfg = RunConfig(
data=DataConfig(csv_path=args.csv_path, min_date=args.min_date, max_date=args.max_date)
)
metrics = run_pipeline(run_cfg)
print(
f"RMSE={metrics['rmse']:.6f}, MAPE={metrics['mape']:.4f}%, R2={metrics['r2']:.6f}, DirAcc={metrics['directional_accuracy']:.4f}"
)
if __name__ == "__main__":
main()

ohlcvpredictor/config.py (new file, 65 lines)

@@ -0,0 +1,65 @@
from dataclasses import dataclass, field
from typing import List, Optional
@dataclass
class DataConfig:
"""Configuration for data loading and basic filtering."""
csv_path: str
min_date: str = "2017-06-01"
max_date: Optional[str] = None
drop_volume_zero: bool = True
@dataclass
class FeatureConfig:
"""Configuration for feature engineering."""
ohlcv_cols: List[str] = field(default_factory=lambda: ["Open", "High", "Low", "Close", "Volume"])
lags: int = 3
window_sizes: List[int] = field(default_factory=lambda: [5, 15, 30])
@dataclass
class PreprocessConfig:
"""Configuration for preprocessing and NaN handling."""
impute_nans: bool = True
@dataclass
class PruningConfig:
"""Configuration for feature pruning and CV."""
do_walk_forward_cv: bool = True
n_splits: int = 5
auto_prune: bool = True
top_k: int = 150
known_low_features: List[str] = field(
default_factory=lambda: [
"supertrend_12_3.0",
"supertrend_10_1.0",
"supertrend_11_2.0",
"supertrend_trend_12_3.0",
"supertrend_trend_10_1.0",
"supertrend_trend_11_2.0",
"hour",
]
)
@dataclass
class OutputConfig:
"""Configuration for outputs and artifacts."""
charts_dir: str = "charts"
results_csv: str = "../data/cumulative_feature_results.csv"
model_output_path: str = "../data/xgboost_model_all_features.json"
@dataclass
class RunConfig:
"""Top-level configuration grouping for a pipeline run."""
data: DataConfig
features: FeatureConfig = field(default_factory=FeatureConfig)
preprocess: PreprocessConfig = field(default_factory=PreprocessConfig)
pruning: PruningConfig = field(default_factory=PruningConfig)
output: OutputConfig = field(default_factory=OutputConfig)
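
The defaults above mirror the constants from the old main.py; a short sketch of overriding them for a custom run (field names come from this file, values are illustrative):

from ohlcvpredictor.config import DataConfig, PreprocessConfig, PruningConfig, RunConfig

cfg = RunConfig(
    data=DataConfig(csv_path="../data/btcusd_1-min_data.csv", min_date="2018-01-01"),
    preprocess=PreprocessConfig(impute_nans=False),  # drop NaN rows instead of mean-imputing
    pruning=PruningConfig(top_k=100, n_splits=3),    # keep fewer features, run a faster CV
)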

ohlcvpredictor/data.py (new file, 38 lines)

@@ -0,0 +1,38 @@
from typing import Tuple
import os
import pandas as pd
import numpy as np
from .config import DataConfig
def load_and_filter_data(cfg: DataConfig) -> pd.DataFrame:
"""Load CSV, filter, and convert timestamp.
- Reads the CSV at cfg.csv_path
- Drops rows with Volume == 0 if configured
- Converts 'Timestamp' from seconds to datetime and filters by cfg.min_date
- Adds 'log_return' target column
"""
if not os.path.exists(cfg.csv_path):
raise FileNotFoundError(f"CSV not found: {cfg.csv_path}")
df = pd.read_csv(cfg.csv_path)
if cfg.drop_volume_zero and 'Volume' in df.columns:
df = df[df['Volume'] != 0]
if 'Timestamp' not in df.columns:
raise ValueError("Expected 'Timestamp' column in input CSV")
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df = df[df['Timestamp'] >= cfg.min_date]
if cfg.max_date:
df = df[df['Timestamp'] <= cfg.max_date]
if 'Close' not in df.columns:
raise ValueError("Expected 'Close' column in input CSV")
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
return df
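
A usage sketch for the loader; the expected CSV layout (a Unix-seconds Timestamp column plus Open/High/Low/Close/Volume) is inferred from the checks above:

from ohlcvpredictor.config import DataConfig
from ohlcvpredictor.data import load_and_filter_data

df = load_and_filter_data(DataConfig(csv_path="../data/btcusd_1-min_data.csv"))
# The first retained row has a NaN log_return (no prior Close to diff against);
# preprocess.handle_nans deals with it later in the pipeline.
print(df[["Timestamp", "Close", "log_return"]].head())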

ohlcvpredictor/metrics.py (new file, 26 lines)

@@ -0,0 +1,26 @@
from typing import Dict, Tuple
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
def compute_price_series_from_log_returns(start_price: float, log_returns: np.ndarray) -> np.ndarray:
"""Reconstruct price series from log returns starting at start_price."""
prices = [start_price]
for r in log_returns:
prices.append(prices[-1] * float(np.exp(r)))
return np.asarray(prices[1:])
def compute_metrics_from_prices(actual_prices: np.ndarray, predicted_prices: np.ndarray) -> Dict[str, float]:
"""Compute RMSE, MAPE, R2, and directional accuracy given price series."""
rmse = float(np.sqrt(mean_squared_error(actual_prices, predicted_prices)))
with np.errstate(divide='ignore', invalid='ignore'):
mape_arr = np.abs((actual_prices - predicted_prices) / np.where(actual_prices == 0, np.nan, actual_prices))
mape = float(np.nanmean(mape_arr) * 100.0)
r2 = float(r2_score(actual_prices, predicted_prices))
direction_actual = np.sign(np.diff(actual_prices))
direction_pred = np.sign(np.diff(predicted_prices))
dir_acc = float((direction_actual == direction_pred).mean()) if len(direction_actual) > 0 else 0.0
return {"rmse": rmse, "mape": mape, "r2": r2, "directional_accuracy": dir_acc}

ohlcvpredictor/model.py (new file, 28 lines)

@@ -0,0 +1,28 @@
from typing import Dict, List, Tuple
import numpy as np
from custom_xgboost import CustomXGBoostGPU
def train_model(
X_train: np.ndarray,
X_test: np.ndarray,
y_train: np.ndarray,
y_test: np.ndarray,
eval_metric: str = 'rmse',
):
"""Train the XGBoost model and return the fitted wrapper."""
model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
model.train(eval_metric=eval_metric)
return model
def predict(model: CustomXGBoostGPU, X: np.ndarray) -> np.ndarray:
"""Predict using the trained model."""
return model.predict(X)
def get_feature_importance(model: CustomXGBoostGPU, feature_names: List[str]) -> Dict[str, float]:
return model.get_feature_importance(feature_names)

ohlcvpredictor/pipeline.py (new file, 125 lines)

@@ -0,0 +1,125 @@
from __future__ import annotations
from typing import Dict, List, Tuple
import os
import csv
import json
import numpy as np
import pandas as pd
from .config import RunConfig
from .data import load_and_filter_data
from .preprocess import add_basic_time_features, downcast_numeric_columns, handle_nans
from .selection import build_feature_list, prune_features
from .model import train_model, predict, get_feature_importance
from .metrics import compute_price_series_from_log_returns, compute_metrics_from_prices
from evaluation import walk_forward_cv
from feature_engineering import feature_engineering
from plot_results import plot_prediction_error_distribution
def ensure_charts_dir(path: str) -> None:
if not os.path.exists(path):
os.makedirs(path, exist_ok=True)
def run_pipeline(cfg: RunConfig) -> Dict[str, float]:
# Setup outputs
ensure_charts_dir(cfg.output.charts_dir)
# Load and target
df = load_and_filter_data(cfg.data)
# Features
features_dict = feature_engineering(
df,
os.path.splitext(os.path.basename(cfg.data.csv_path))[0],
cfg.features.ohlcv_cols,
cfg.features.lags,
cfg.features.window_sizes,
)
features_df = pd.DataFrame(features_dict)
df = pd.concat([df, features_df], axis=1)
# Preprocess
df = downcast_numeric_columns(df)
df = add_basic_time_features(df)
df = handle_nans(df, cfg.preprocess)
# Feature selection and pruning
feature_cols = build_feature_list(df.columns)
X = df[feature_cols].values.astype(np.float32)
y = df["log_return"].values.astype(np.float32)
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
importance_avg = None
if cfg.pruning.do_walk_forward_cv:
metrics_avg, importance_avg = walk_forward_cv(X, y, feature_cols, n_splits=cfg.pruning.n_splits)
# Optional: you may log or return metrics_avg
kept_feature_cols = prune_features(feature_cols, importance_avg, cfg.pruning) if cfg.pruning.auto_prune else feature_cols
# Train model
model = train_model(
df[kept_feature_cols].values.astype(np.float32)[:split_idx],
df[kept_feature_cols].values.astype(np.float32)[split_idx:],
y[:split_idx],
y[split_idx:],
eval_metric='rmse',
)
# Save model
model.save_model(cfg.output.model_output_path)
# Persist the exact feature list used for training next to the model
try:
features_path = os.path.splitext(cfg.output.model_output_path)[0] + "_features.json"
with open(features_path, "w") as f:
json.dump({"feature_names": kept_feature_cols}, f)
except Exception:
# Feature list persistence is optional; avoid breaking the run on failure
pass
# Predict
X_test_kept = df[kept_feature_cols].values.astype(np.float32)[split_idx:]
test_preds = predict(model, X_test_kept)
# Reconstruct price series
close_prices = df['Close'].values
start_price = close_prices[split_idx]
actual_prices = compute_price_series_from_log_returns(start_price, y_test)
predicted_prices = compute_price_series_from_log_returns(start_price, test_preds)
# Metrics
metrics = compute_metrics_from_prices(actual_prices, predicted_prices)
# Plot prediction error distribution to charts dir (parity with previous behavior)
try:
plot_prediction_error_distribution(predicted_prices, actual_prices, prefix="all_features")
except Exception:
# plotting is optional; ignore failures in headless environments
pass
# Persist per-feature metrics and importances
feat_importance = get_feature_importance(model, kept_feature_cols)
if not os.path.exists(cfg.output.results_csv):
with open(cfg.output.results_csv, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['feature', 'rmse', 'mape', 'r2', 'directional_accuracy', 'feature_importance'])
with open(cfg.output.results_csv, 'a', newline='') as f:
writer = csv.writer(f)
for feature in kept_feature_cols:
importance = feat_importance.get(feature, 0.0)
row = [feature]
for key in ['rmse', 'mape', 'r2', 'directional_accuracy']:
val = metrics[key]
row.append(f"{val:.10f}")
row.append(f"{importance:.6f}")
writer.writerow(row)
return metrics
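
Once a run has appended its rows, the results CSV can be inspected like this (column names match the header written above; rmse/mape/r2/directional_accuracy repeat the run-level metrics on every row, while feature_importance is per feature):

import pandas as pd

results = pd.read_csv("../data/cumulative_feature_results.csv")
top = results.sort_values("feature_importance", ascending=False).head(10)
print(top[["feature", "feature_importance", "rmse", "directional_accuracy"]])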

ohlcvpredictor/preprocess.py (new file, 39 lines)

@@ -0,0 +1,39 @@
from typing import List
import pandas as pd
import numpy as np
from .config import PreprocessConfig
def add_basic_time_features(df: pd.DataFrame) -> pd.DataFrame:
"""Add basic time features such as hour-of-day."""
df = df.copy()
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
df['hour'] = df['Timestamp'].dt.hour
return df
def downcast_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
"""Downcast numeric columns to save memory."""
df = df.copy()
for col in df.columns:
try:
df[col] = pd.to_numeric(df[col], downcast='float')
except Exception:
# ignore non-numeric columns
pass
return df
def handle_nans(df: pd.DataFrame, cfg: PreprocessConfig) -> pd.DataFrame:
"""Impute NaNs (mean) or drop rows, based on config."""
df = df.copy()
if cfg.impute_nans:
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
df[col] = df[col].fillna(df[col].mean())
else:
df = df.dropna().reset_index(drop=True)
return df

ohlcvpredictor/selection.py (new file, 59 lines)

@@ -0,0 +1,59 @@
from typing import Dict, Iterable, List, Sequence, Set, Tuple
import numpy as np
from .config import PruningConfig
EXCLUDE_BASE_FEATURES: List[str] = [
'Timestamp', 'Close',
'log_return_5', 'volatility_5', 'volatility_15', 'volatility_30',
'bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0',
]
def build_feature_list(all_columns: Sequence[str]) -> List[str]:
"""Return the model feature list by excluding base columns and targets."""
return [col for col in all_columns if col not in EXCLUDE_BASE_FEATURES]
def prune_features(
feature_cols: Sequence[str],
importance_avg: Dict[str, float] | None,
cfg: PruningConfig,
) -> List[str]:
"""Decide which features to keep using averaged importances and rules."""
prune_set: Set[str] = set()
if importance_avg is not None:
sorted_feats = sorted(importance_avg.items(), key=lambda kv: kv[1], reverse=True)
keep_names = set(name for name, _ in sorted_feats[: cfg.top_k])
for name in feature_cols:
if name not in keep_names:
prune_set.add(name)
for name in cfg.known_low_features:
if name in feature_cols:
prune_set.add(name)
# If Parkinson vol exists, drop alternatives at same window
for w in [5, 15, 30]:
park = f'park_vol_{w}'
if park in feature_cols:
for alt in [f'gk_vol_{w}', f'rs_vol_{w}', f'yz_vol_{w}']:
if alt in feature_cols:
prune_set.add(alt)
kept = [c for c in feature_cols if c not in prune_set]
return kept
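
A toy illustration of the pruning rules (feature names and importances invented purely for the example):

from ohlcvpredictor.config import PruningConfig
from ohlcvpredictor.selection import prune_features

cols = ["rsi_14", "park_vol_5", "gk_vol_5", "hour"]
importance = {"rsi_14": 0.9, "park_vol_5": 0.5, "gk_vol_5": 0.4, "hour": 0.1}
kept = prune_features(cols, importance, PruningConfig(top_k=3))
# 'hour' falls outside the top-3 and is also a known low-value feature;
# 'gk_vol_5' is dropped because 'park_vol_5' exists at the same window.
print(kept)  # ['rsi_14', 'park_vol_5']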

OHLCVPredictor module (modified)

@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np
import os
import json
try:
from .custom_xgboost import CustomXGBoostGPU
@@ -19,6 +20,21 @@ class OHLCVPredictor:
self.model = CustomXGBoostGPU.load_model(model_path)
self.exclude_cols = self._get_excluded_features()
self._feature_names = self._load_trained_feature_names(model_path)
def _load_trained_feature_names(self, model_path: str):
"""Load the exact feature list saved during training, if present."""
try:
features_path = os.path.splitext(model_path)[0] + "_features.json"
if os.path.exists(features_path):
with open(features_path, "r") as f:
data = json.load(f)
names = data.get("feature_names")
if isinstance(names, list) and all(isinstance(x, str) for x in names):
return names
except Exception:
pass
return None
def _get_excluded_features(self):
"""Get the list of features to exclude (copied from main.py)"""
@@ -80,7 +96,14 @@
df = df.copy()
# Select features and predict
feature_cols = [col for col in df.columns if col not in self.exclude_cols]
if self._feature_names is not None:
# Use the exact training feature names and order
missing = [c for c in self._feature_names if c not in df.columns]
if missing:
raise ValueError(f"Input is missing required trained features: {missing[:10]}{'...' if len(missing)>10 else ''}")
feature_cols = self._feature_names
else:
feature_cols = [col for col in df.columns if col not in self.exclude_cols]
X = df[feature_cols].values.astype(np.float32)
return self.model.predict(X)