diff --git a/xgboost/custom_xgboost.py b/xgboost/custom_xgboost.py
index 697dc25..dd5a58a 100644
--- a/xgboost/custom_xgboost.py
+++ b/xgboost/custom_xgboost.py
@@ -31,3 +31,9 @@ class CustomXGBoostGPU:
             raise ValueError('Model not trained yet.')
         dmatrix = xgb.DMatrix(X.astype(np.float32))
         return self.model.predict(dmatrix)
+
+    def save_model(self, file_path):
+        """Save the trained XGBoost model to the specified file path."""
+        if self.model is None:
+            raise ValueError('Model not trained yet.')
+        self.model.save_model(file_path)
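A booster saved through the new `save_model` method can be restored with xgboost's native loader. A minimal round-trip sketch (the path matches the one used in `main.py` below; the feature matrix is illustrative):

```python
import numpy as np
import xgboost as xgb

# Restore a booster saved by CustomXGBoostGPU.save_model(...)
booster = xgb.Booster()
booster.load_model('./data/xgboost_model.json')

# Predict as in CustomXGBoostGPU.predict: wrap features in a DMatrix first
# (illustrative shape — the column count must match the training features)
X_new = np.random.rand(5, 10).astype(np.float32)
preds = booster.predict(xgb.DMatrix(X_new))
```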
diff --git a/xgboost/main.py b/xgboost/main.py
index 4110d96..cf1a462 100644
--- a/xgboost/main.py
+++ b/xgboost/main.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns
+from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
 import ta
 from cycles.supertrend import Supertrends
 from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
@@ -14,7 +14,6 @@ from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator
 from ta.volatility import KeltnerChannel, DonchianChannel
 from ta.others import DailyReturnIndicator
 import time
-import concurrent.futures
 from numba import njit
 
 def run_indicator(func, *args):
@@ -101,13 +100,9 @@ def calc_momentum(close):
     return ('momentum_10', close - close.shift(10))
 
 def calc_psar(high, low, close):
-    from ta.trend import PSARIndicator
-    psar = PSARIndicator(high, low, close)
-    return [
-        ('psar', psar.psar()),
-        ('psar_up', psar.psar_up()),
-        ('psar_down', psar.psar_down())
-    ]
+    # Use the Numba-accelerated fast_psar function for speed
+    psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
+    return [('psar', pd.Series(psar_values, index=close.index))]
 
 def calc_donchian(high, low, close):
     from ta.volatility import DonchianChannel
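The `fast_psar` function called here is not shown in this diff, so presumably it is defined elsewhere in `main.py`. For reference, a minimal Numba implementation consistent with the call site might look like the following sketch; the step (0.02) and cap (0.2) are conventional Wilder defaults, assumed rather than taken from the source:

```python
import numpy as np
from numba import njit

@njit
def fast_psar(high, low, close, af_step=0.02, af_max=0.2):
    """Wilder's Parabolic SAR over plain float arrays (sketch)."""
    n = len(close)
    psar = np.empty(n)
    psar[0] = low[0]
    uptrend = True      # assume an initial uptrend
    af = af_step        # acceleration factor
    ep = high[0]        # extreme point
    for i in range(1, n):
        psar[i] = psar[i - 1] + af * (ep - psar[i - 1])
        if uptrend:
            if low[i] < psar[i]:      # reversal to downtrend
                uptrend = False
                psar[i] = ep
                ep = low[i]
                af = af_step
            elif high[i] > ep:
                ep = high[i]
                af = min(af + af_step, af_max)
        else:
            if high[i] > psar[i]:     # reversal to uptrend
                uptrend = True
                psar[i] = ep
                ep = high[i]
                af = af_step
            elif low[i] < ep:
                ep = low[i]
                af = min(af + af_step, af_max)
    return psar
```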
@@ -220,17 +215,18 @@ if __name__ == '__main__':
     csv_path = './data/btcusd_1-min_data.csv'
     csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
 
+    print('Reading CSV and filtering data...')
     df = pd.read_csv(csv_path)
     df = df[df['Volume'] != 0]
 
     min_date = '2017-06-01'
+    print('Converting Timestamp and filtering by date...')
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
     df = df[df['Timestamp'] >= min_date]
 
     lags = 3
 
     print('Calculating log returns as the new target...')
-    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
 
     ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
@@ -243,55 +239,282 @@ if __name__ == '__main__':
 
     # --- Technical Indicator Features: Calculate or Load from Cache ---
     print('Calculating or loading technical indicator features...')
-    indicator_jobs = [
-        ('rsi', calc_rsi, [df['Close']]),
-        ('macd', calc_macd, [df['Close']]),
-        ('atr', calc_atr, [df['High'], df['Low'], df['Close']]),
-        ('cci', calc_cci, [df['High'], df['Low'], df['Close']]),
-        ('williams_r', calc_williamsr, [df['High'], df['Low'], df['Close']]),
-        ('ema_14', calc_ema, [df['Close']]),
-        ('obv', calc_obv, [df['Close'], df['Volume']]),
-        ('cmf', calc_cmf, [df['High'], df['Low'], df['Close'], df['Volume']]),
-        ('roc_10', calc_roc, [df['Close']]),
-        ('dpo_20', calc_dpo, [df['Close']]),
-        ('ultimate_osc', calc_ultimate, [df['High'], df['Low'], df['Close']]),
-        ('daily_return', calc_daily_return, [df['Close']]),
-    ]
+    # RSI
+    feature_file = f'./data/{csv_prefix}_rsi.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['rsi'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: rsi')
+        _, values = calc_rsi(df['Close'])
+        features_dict['rsi'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # MACD
+    feature_file = f'./data/{csv_prefix}_macd.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['macd'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: macd')
+        _, values = calc_macd(df['Close'])
+        features_dict['macd'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # ATR
+    feature_file = f'./data/{csv_prefix}_atr.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['atr'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: atr')
+        _, values = calc_atr(df['High'], df['Low'], df['Close'])
+        features_dict['atr'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # CCI
+    feature_file = f'./data/{csv_prefix}_cci.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['cci'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: cci')
+        _, values = calc_cci(df['High'], df['Low'], df['Close'])
+        features_dict['cci'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Williams %R
+    feature_file = f'./data/{csv_prefix}_williams_r.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['williams_r'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: williams_r')
+        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
+        features_dict['williams_r'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # EMA 14
+    feature_file = f'./data/{csv_prefix}_ema_14.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['ema_14'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: ema_14')
+        _, values = calc_ema(df['Close'])
+        features_dict['ema_14'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # OBV
+    feature_file = f'./data/{csv_prefix}_obv.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['obv'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: obv')
+        _, values = calc_obv(df['Close'], df['Volume'])
+        features_dict['obv'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # CMF
+    feature_file = f'./data/{csv_prefix}_cmf.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['cmf'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: cmf')
+        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
+        features_dict['cmf'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # ROC 10
+    feature_file = f'./data/{csv_prefix}_roc_10.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['roc_10'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: roc_10')
+        _, values = calc_roc(df['Close'])
+        features_dict['roc_10'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # DPO 20
+    feature_file = f'./data/{csv_prefix}_dpo_20.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['dpo_20'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: dpo_20')
+        _, values = calc_dpo(df['Close'])
+        features_dict['dpo_20'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Ultimate Oscillator
+    feature_file = f'./data/{csv_prefix}_ultimate_osc.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: ultimate_osc')
+        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
+        features_dict['ultimate_osc'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Daily Return
+    feature_file = f'./data/{csv_prefix}_daily_return.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['daily_return'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: daily_return')
+        _, values = calc_daily_return(df['Close'])
+        features_dict['daily_return'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
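The twelve blocks above repeat one cache-or-compute pattern verbatim. For comparison, they could be collapsed into a helper along these lines (an illustrative refactor, not part of this diff):

```python
import os
import numpy as np
import pandas as pd

def cache_or_compute(name, func, args, features_dict, csv_prefix, index):
    """Load a single-column feature from its .npy cache, or compute and cache it."""
    feature_file = f'./data/{csv_prefix}_{name}.npy'
    if os.path.exists(feature_file):
        print(f'Loading cached feature: {feature_file}')
        features_dict[name] = pd.Series(np.load(feature_file), index=index)
    else:
        print(f'Calculating feature: {name}')
        _, values = func(*args)
        features_dict[name] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

# e.g. cache_or_compute('rsi', calc_rsi, [df['Close']], features_dict, csv_prefix, df.index)
```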
     # Multi-column indicators
-    multi_indicator_jobs = [
-        ('bollinger', calc_bollinger, [df['Close']]),
-        ('stochastic', calc_stochastic, [df['High'], df['Low'], df['Close']]),
-        ('sma', calc_sma, [df['Close']]),
-        ('psar', calc_psar, [df['High'], df['Low'], df['Close']]),
-        ('donchian', calc_donchian, [df['High'], df['Low'], df['Close']]),
-        ('keltner', calc_keltner, [df['High'], df['Low'], df['Close']]),
-        ('ichimoku', calc_ichimoku, [df['High'], df['Low']]),
-        ('elder_ray', calc_elder_ray, [df['Close'], df['Low'], df['High']]),
-    ]
-    for feature_name, func, args in indicator_jobs:
-        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-        if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
-            features_dict[feature_name] = np.load(feature_file)
+    # Bollinger Bands
+    print('Calculating multi-column indicator: bollinger')
+    result = calc_bollinger(df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
         else:
-            result = func(*args)
-            if isinstance(result, tuple):
-                _, values = result
-                features_dict[feature_name] = values
-                np.save(feature_file, values.values)
-            else:
-                raise ValueError(f"Unexpected result for {feature_name}")
-    for feature_name, func, args in multi_indicator_jobs:
-        # These return a list of (name, values)
-        result = func(*args)
-        for subname, values in result:
-            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
-            if os.path.exists(sub_feature_file):
-                print(f'Loading cached feature: {sub_feature_file}')
-                features_dict[subname] = np.load(sub_feature_file)
-            else:
-                features_dict[subname] = values
-                np.save(sub_feature_file, values.values)
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Stochastic Oscillator
+    print('Calculating multi-column indicator: stochastic')
+    result = calc_stochastic(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # SMA
+    print('Calculating multi-column indicator: sma')
+    result = calc_sma(df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # PSAR
+    print('Calculating multi-column indicator: psar')
+    result = calc_psar(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Donchian Channel
+    print('Calculating multi-column indicator: donchian')
+    result = calc_donchian(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Keltner Channel
+    print('Calculating multi-column indicator: keltner')
+    result = calc_keltner(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Ichimoku
+    print('Calculating multi-column indicator: ichimoku')
+    result = calc_ichimoku(df['High'], df['Low'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Elder Ray
+    print('Calculating multi-column indicator: elder_ray')
+    result = calc_elder_ray(df['Close'], df['Low'], df['High'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
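One caveat on the multi-column blocks above: each calls its `calc_*` function before consulting the cache, so the indicator is recomputed even when every subfeature file already exists. A sketch of how the check could be hoisted, assuming the subfeature names are known up front (the names in the usage comment are hypothetical):

```python
import os
import numpy as np
import pandas as pd

def cache_or_compute_multi(subnames, func, args, features_dict, csv_prefix, index):
    """Skip the indicator computation entirely when every subfeature is cached."""
    paths = {s: f'./data/{csv_prefix}_{s}.npy' for s in subnames}
    if all(os.path.exists(p) for p in paths.values()):
        for subname, path in paths.items():
            features_dict[subname] = pd.Series(np.load(path), index=index)
        return
    for subname, values in func(*args):  # compute once, cache each subfeature
        features_dict[subname] = values
        np.save(paths[subname], values.values)

# Hypothetical subfeature names for calc_bollinger:
# cache_or_compute_multi(['bb_upper', 'bb_lower'], calc_bollinger, [df['Close']],
#                        features_dict, csv_prefix, df.index)
```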
 
     # Prepare jobs for lags, rolling stats, log returns, and volatility
     feature_jobs = []
@@ -301,9 +524,10 @@ if __name__ == '__main__':
             feature_name = f'{col}_lag{lag}'
             feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             if os.path.exists(feature_file):
-                print(f'Loading cached feature: {feature_file}')
+                print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
+                print(f'Adding lag feature job: {feature_name}')
                 feature_jobs.append((feature_name, compute_lag, col, lag))
     # Rolling statistics
     for col in ohlcv_cols:
@@ -320,48 +544,56 @@ if __name__ == '__main__':
                 feature_name = f'{col}_roll_{stat}_{window}'
                 feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                 if os.path.exists(feature_file):
-                    print(f'Loading cached feature: {feature_file}')
+                    print(f'D Loading cached feature: {feature_file}')
                     features_dict[feature_name] = np.load(feature_file)
                 else:
+                    print(f'Adding rolling stat feature job: {feature_name}')
                     feature_jobs.append((feature_name, compute_rolling, col, stat, window))
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
         feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
         if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
+            print(f'E Loading cached feature: {feature_file}')
            features_dict[feature_name] = np.load(feature_file)
         else:
+            print(f'Adding log return feature job: {feature_name}')
             feature_jobs.append((feature_name, compute_log_return, horizon))
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
         feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
         if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
+            print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
+            print(f'Adding volatility feature job: {feature_name}')
             feature_jobs.append((feature_name, compute_volatility, window))
 
-    # Parallel computation for all non-cached features
+    # Sequential computation for all non-cached features
     if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features in parallel...')
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            futures = [executor.submit(run_feature_job, job, df) for job in feature_jobs]
-            for future in concurrent.futures.as_completed(futures):
-                feature_name, result = future.result()
-                features_dict[feature_name] = result
-                feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-                np.save(feature_file, result.values)
-        print('All parallel features computed.')
+        print(f'Computing {len(feature_jobs)} features sequentially...')
+        for job in feature_jobs:
+            print(f'Computing feature job: {job[0]}')
+            feature_name, result = run_feature_job(job, df)
+            features_dict[feature_name] = result
+            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+            np.save(feature_file, result.values)
+            print(f'Saved computed feature: {feature_file}')
+        print('All features computed.')
     else:
         print('All features loaded from cache.')
 
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
     features_df = pd.DataFrame(features_dict)
+    print("Columns in features_df:", features_df.columns.tolist())
+    print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
     df = pd.concat([df, features_df], axis=1)
+    # Print all columns after concatenation
+    print("All columns in df after concat:", df.columns.tolist())
+
     # Downcast all float columns to save memory
     print('Downcasting float columns to save memory...')
     for col in df.columns:
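A detail worth flagging around `features_df = pd.DataFrame(features_dict)` above: the dict mixes bare arrays (the `np.load` branches in the lag, rolling, and volatility sections) with `pd.Series` objects carrying `df.index`. Since `df` was filtered earlier, its index is not a default RangeIndex, and pandas aligns Series by label while placing bare arrays positionally. A small illustration of the difference:

```python
import numpy as np
import pandas as pd

# A df-style index that survived filtering (not 0..n-1)
index = pd.Index([10, 11, 12])

as_array = np.array([1.0, 2.0, 3.0])                 # placed positionally
as_series = pd.Series([1.0, 2.0, 3.0], index=index)  # aligned by label

print(pd.DataFrame({'a': as_array, 'b': as_series}))
# Both columns land on rows 10, 11, 12 — the array adopts the Series' index.

default_series = pd.Series([1.0, 2.0, 3.0])          # index 0, 1, 2
print(pd.DataFrame({'a': default_series, 'b': as_series}))
# Label alignment yields six rows, half of them NaN in each column.
```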
@@ -371,6 +603,7 @@ if __name__ == '__main__':
             pass
 
     # Drop intermediate features_df to free memory
+    print('Dropping intermediate features_df to free memory...')
     del features_df
     import gc
     gc.collect()
@@ -408,6 +641,10 @@ if __name__ == '__main__':
     print('Selecting feature columns...')
     exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
     feature_cols = [col for col in df.columns if col not in exclude_cols]
+
+    # Print the features used for training
+    print("Features used for training:", feature_cols)
+
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
     df = df[feature_cols + ['log_return', 'Timestamp']]
@@ -417,6 +654,7 @@ if __name__ == '__main__':
     y = df['log_return'].values.astype(np.float32)
 
     split_idx = int(len(X) * 0.8)
+    print(f'Splitting data: {split_idx} train, {len(X) - split_idx} test')
     X_train, X_test = X[:split_idx], X[split_idx:]
     y_train, y_test = y[:split_idx], y[split_idx:]
     test_timestamps = df['Timestamp'].values[split_idx:]
@@ -428,7 +666,11 @@ if __name__ == '__main__':
     booster = model.train()
     print('Training complete.')
-
+
+    # Save the trained model
+    model.save_model('./data/xgboost_model.json')
+    print('Model saved to ./data/xgboost_model.json')
+
     if hasattr(model, 'params'):
         print("Model hyperparameters:", model.params)
     if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
@@ -441,18 +683,49 @@ if __name__ == '__main__':
         for feat, score in sorted_importances:
             print(f'{feature_map.get(feat, feat)}: {score}')
 
+    print('Making predictions for first 5 test samples...')
     preds = model.predict(X_test[:5])
     print('Predictions for first 5 test samples:', preds)
     print('Actual values for first 5 test samples:', y_test[:5])
 
+    print('Making predictions for all test samples...')
     test_preds = model.predict(X_test)
     rmse = np.sqrt(mean_squared_error(y_test, test_preds))
     print(f'RMSE on test set: {rmse:.4f}')
 
+    print('Saving y_test and test_preds to disk...')
     np.save('./data/y_test.npy', y_test)
     np.save('./data/test_preds.npy', test_preds)
 
-    # display_actual_vs_predicted(y_test, test_preds, test_timestamps)
-    # plot_target_distribution(y_train, y_test)
+    # Reconstruct price series from log returns
+    print('Reconstructing price series from log returns...')
+    # Get the last available Close price before the test set: y_test[0] is the
+    # log return from row split_idx - 1 to row split_idx, so the walk must
+    # start from the Close at split_idx - 1.
+    if 'Close' in df.columns:
+        close_prices = df['Close'].values
+    else:
+        # Reload original CSV to get Close prices if not present
+        # (note: positions may not line up with df if rows were filtered above)
+        close_prices = pd.read_csv(csv_path)['Close'].values
+    start_price = close_prices[split_idx - 1]  # Last Close before the test set
+    # Actual prices
+    actual_prices = [start_price]
+    for r in y_test:
+        actual_prices.append(actual_prices[-1] * np.exp(r))
+    actual_prices = np.array(actual_prices[1:])
+    # Predicted prices
+    predicted_prices = [start_price]
+    for r in test_preds:
+        predicted_prices.append(predicted_prices[-1] * np.exp(r))
+    predicted_prices = np.array(predicted_prices[1:])
 
-    plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps)
+    print('Plotting predicted vs actual prices...')
+    plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
+
+    print("Final features used for training:", feature_cols)
+
+    print("Shape of X:", X.shape)
+    print("First row of X:", X[0])
+    print("stoch_k in feature_cols?", "stoch_k" in feature_cols)
+    if "stoch_k" in feature_cols:
+        idx = feature_cols.index("stoch_k")
+        print("First 10 values of stoch_k:", X[:10, idx])
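The two reconstruction loops in the final hunk above walk the price forward one return at a time. They are equivalent to a cumulative sum in log space, which would be shorter and faster; a vectorized sketch with illustrative stand-in values:

```python
import numpy as np

# Stand-ins for the variables above (values illustrative)
start_price = 30000.0
y_test = np.array([0.001, -0.002, 0.0015], dtype=np.float32)
test_preds = np.array([0.0008, -0.001, 0.002], dtype=np.float32)

# price_t = start_price * exp(r_1 + ... + r_t), so one cumsum replaces each loop
actual_prices = start_price * np.exp(np.cumsum(y_test))
predicted_prices = start_price * np.exp(np.cumsum(test_preds))
```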
diff --git a/xgboost/plot_results.py b/xgboost/plot_results.py
index 4c9de7a..6acf429 100644
--- a/xgboost/plot_results.py
+++ b/xgboost/plot_results.py
@@ -109,3 +109,61 @@ def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_plot=200):
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')
+
+def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=None, n_plot=200):
+    import numpy as np
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    n_plot = min(n_plot, len(actual_prices))
+    actual = actual_prices[:n_plot]
+    predicted = predicted_prices[:n_plot]
+    if timestamps is not None:
+        x_axis = timestamps[:n_plot]
+        x_label = 'Timestamp'
+    else:
+        x_axis = list(range(n_plot))
+        x_label = 'Index'
+
+    # Line plot: Actual vs Predicted over time
+    trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual Price')
+    trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted Price')
+    data_line = [trace_actual, trace_predicted]
+    layout_line = go.Layout(
+        title='Actual vs. Predicted BTC Prices (Test Set)',
+        xaxis={'title': x_label},
+        yaxis={'title': 'BTC Price'},
+        legend={'x': 0, 'y': 1},
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_line = go.Figure(data=data_line, layout=layout_line)
+    pyo.plot(fig_line, filename='price_line_plot.html')
+
+    # Scatter plot: Predicted vs Actual
+    trace_scatter = go.Scatter(
+        x=actual,
+        y=predicted,
+        mode='markers',
+        name='Predicted vs Actual',
+        opacity=0.5
+    )
+    # Diagonal reference line
+    min_val = min(np.min(actual), np.min(predicted))
+    max_val = max(np.max(actual), np.max(predicted))
+    trace_diag = go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode='lines',
+        name='Ideal',
+        line=dict(dash='dash', color='red')
+    )
+    data_scatter = [trace_scatter, trace_diag]
+    layout_scatter = go.Layout(
+        title='Predicted vs Actual Prices (Scatter)',
+        xaxis={'title': 'Actual Price'},
+        yaxis={'title': 'Predicted Price'},
+        showlegend=True,
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
+    pyo.plot(fig_scatter, filename='price_scatter_plot.html')
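Finally, a minimal usage sketch for the new plotting helper, with synthetic inputs shaped like the arrays produced in `main.py` (all values hypothetical):

```python
import numpy as np
import pandas as pd
from plot_results import plot_predicted_vs_actual_prices

# Synthetic random-walk prices (hypothetical data for demonstration)
rng = np.random.default_rng(0)
timestamps = pd.date_range('2024-01-01', periods=200, freq='min')
actual = 30000 * np.exp(np.cumsum(rng.normal(0, 1e-3, 200)))
predicted = actual * np.exp(rng.normal(0, 5e-4, 200))

# Writes price_line_plot.html and price_scatter_plot.html to the working directory
plot_predicted_vs_actual_prices(actual, predicted, timestamps=timestamps)
```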