init

parent ed67968cc2
commit 302be95ce7
11 .cursorignore Normal file
@@ -0,0 +1,11 @@
bitcoin_historical_data.db
docker_run.sh
Dockerfile
data/*
models/*
plots/*
results/*
tests/*
.gitignore
.git/*
.gitignore
34 .devcontainer/Dockerfile Normal file
@@ -0,0 +1,34 @@
FROM tensorflow/tensorflow:latest-gpu

# Install zstd, wget, python3-pip, and unzip
RUN apt-get update && apt-get install -y zstd wget python3-pip unzip

# Download and set up CUDA repository pin
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
    mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download and install CUDA repository
RUN wget https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda-repo-ubuntu2204-12-3-local_12.3.2-545.23.08-1_amd64.deb && \
    dpkg -i cuda-repo-ubuntu2204-12-3-local_12.3.2-545.23.08-1_amd64.deb && \
    cp /var/cuda-repo-ubuntu2204-12-3-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
    apt-get update && \
    apt-get -y install cuda-toolkit-12-3

# Download and install cuDNN repository
RUN wget https://developer.download.nvidia.com/compute/cudnn/9.7.1/local_installers/cudnn-local-repo-ubuntu2204-9.7.1_1.0-1_amd64.deb && \
    dpkg -i cudnn-local-repo-ubuntu2204-9.7.1_1.0-1_amd64.deb && \
    cp /var/cudnn-local-repo-ubuntu2204-9.7.1/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
    apt-get update && \
    apt-get -y install cudnn

# Set environment variables
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${CUDA_HOME}/bin:${PATH}

# Verify the installation
RUN nvcc --version
RUN nvidia-smi
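# Editor's note (assumption, not part of the original commit): `RUN nvidia-smi` only works
# if the build stage can see the NVIDIA runtime; a plain `docker build` usually has no GPU
# or driver utilities visible, so this step may fail and could instead be run as a
# post-build check, e.g. `docker run --rm --gpus all marketparser:latest nvidia-smi`
# (marketparser:latest is the tag referenced by .devcontainer/devcontainer.json).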

# Install Python packages
RUN pip install sqlalchemy scikit-learn pandas matplotlib kaggle
4 .devcontainer/devcontainer.json Normal file
@@ -0,0 +1,4 @@
{
    "name": "marketparser",
    "image": "marketparser:latest"
}
7 .gitignore vendored
@@ -1,5 +1,12 @@
# ---> Python
# Byte-compiled / optimized / DLL files

data/
*.zip
*.db
plots/
results/

__pycache__/
*.py[cod]
*$py.class
496 BitcoinPricePredictor.py Normal file
@@ -0,0 +1,496 @@
import os
from datetime import datetime
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from sqlalchemy import create_engine
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import gc
import matplotlib.pyplot as plt


class BitcoinPricePredictor:
    def __init__(self, db_path, timeframe, model=None, timesteps=10, batch_size=8, learning_rate=0.001, epochs=50):
        self.db_path = db_path
        self.engine = create_engine(f'sqlite:///{self.db_path}')
        self.timesteps = timesteps
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.model = model
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.history = None
        self.scaler = None
        self.timeframe = timeframe
        self.feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume',
                                'HL_Ratio', 'SMA_7', 'SMA_21', 'Price_Change']

    @staticmethod
    def reduce_mem_usage(df):
        """Optimize memory usage of the dataframe by downcasting numeric types."""
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2

        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:  # Only process numeric columns
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)

        end_mem = df.memory_usage().sum() / 1024**2
        print(f'Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB ({(1 - end_mem/start_mem)*100:.1f}% reduction)')
        return df

    def add_essential_features(self, df):
        print("Adding technical indicators and features...")

        df = df.copy()

        # Price ratio features
        df['HL_Ratio'] = (df['High'] / df['Low']).clip(lower=0.8, upper=1.2)

        # Moving averages with different timeframes
        df['SMA_7'] = df['Close'].rolling(window=7, min_periods=1).mean()
        df['SMA_21'] = df['Close'].rolling(window=21, min_periods=1).mean()
        df['SMA_50'] = df['Close'].rolling(window=50, min_periods=1).mean()

        # Exponential moving averages
        df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
        df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()

        # MACD
        df['MACD'] = df['EMA_12'] - df['EMA_26']
        df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

        # Relative Strength Index (RSI)
        delta = df['Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        df['RSI'] = 100 - (100 / (1 + rs))

        # Bollinger Bands
        df['BB_Middle'] = df['Close'].rolling(window=20).mean()
        df['BB_Std'] = df['Close'].rolling(window=20).std()
        df['BB_Upper'] = df['BB_Middle'] + 2 * df['BB_Std']
        df['BB_Lower'] = df['BB_Middle'] - 2 * df['BB_Std']
        df['BB_Width'] = (df['BB_Upper'] - df['BB_Lower']) / df['BB_Middle']

        # Price changes at different timeframes
        df['Price_Change_1d'] = df['Close'].pct_change(periods=1).clip(lower=-0.5, upper=0.5)
        df['Price_Change_3d'] = df['Close'].pct_change(periods=3).clip(lower=-0.5, upper=0.5)
        df['Price_Change_7d'] = df['Close'].pct_change(periods=7).clip(lower=-0.5, upper=0.5)

        # Volatility
        df['Volatility'] = df['Close'].rolling(window=14).std() / df['Close'].rolling(window=14).mean()

        # Clean up any NaN or infinite values
        df = df.fillna(0)
        df = df.replace([np.inf, -np.inf], 0)

        # Update feature columns list
        self.feature_columns = [col for col in df.columns if col not in ['Next_Period_Return', 'Next_Period_Up']]

        print(f"Shape after adding features: {df.shape}")
        return df

    def create_sequences(self, data, target):
        """Create sequences of data for LSTM input with corresponding targets."""
        x, y = [], []
        for i in range(len(data) - self.timesteps):
            x.append(data[i:i + self.timesteps])
            y.append(target[i + self.timesteps])
        return np.array(x, dtype=np.float32), np.array(y, dtype=np.float32)
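
    # Example for create_sequences (illustrative, not in the original file): with
    # timesteps=10 and a (1000, 9) feature matrix, it returns x with shape
    # (990, 10, 9) and y with shape (990,) -- each 10-step window is paired with
    # the label of the step that immediately follows it.
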
    def create_sequences_for_prediction(self, data):
        """Create sequences of data for prediction without targets."""
        x = []
        for i in range(len(data) - self.timesteps):
            x.append(data[i:i + self.timesteps])
        return np.array(x, dtype=np.float32)

    def create_model(self, input_shape):
        """Create and compile the LSTM model architecture."""
        model = Sequential([
            LSTM(64, return_sequences=True, input_shape=input_shape,
                 recurrent_dropout=0.2, kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)),
            Dropout(0.3),
            LSTM(32, return_sequences=True, recurrent_dropout=0.1, kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)),
            Dropout(0.2),
            LSTM(16),
            Dropout(0.2),
            Dense(8, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        model.summary()
        return model

    def load_and_prepare_data(self):
        start_time = time.time()

        print("Loading data from database...")
        df = pd.read_sql('SELECT * FROM bitcoin_data', self.engine, index_col='Timestamp', parse_dates=['Timestamp'])
        print(f"Initial dataset shape: {df.shape}")
        print(f"Timeframe: {self.timeframe}")

        df = self.resample_data(df)

        df = self.add_essential_features(df)

        # Define target variable - binary classification for price movement
        df['Next_Period_Return'] = df['Close'].pct_change(periods=1).shift(-1).clip(lower=-0.5, upper=0.5)
        df['Next_Period_Up'] = (df['Next_Period_Return'] > 0).astype(np.int8)
        df = df.dropna()

        # Scale features
        self.scaler = RobustScaler()
        df[self.feature_columns] = self.scaler.fit_transform(df[self.feature_columns])

        # Create sequences for LSTM
        x, y = self.create_sequences(df[self.feature_columns].values, df['Next_Period_Up'].values)
        print(f"Sequence shape: {x.shape}, Target shape: {y.shape}")

        # Class balance check
        class_distribution = np.bincount(y.astype(int))
        print(f"Class distribution - 0: {class_distribution[0]}, 1: {class_distribution[1]}")
        print(f"Positive class ratio: {class_distribution[1]/len(y):.2f}")

        # Train-test split (chronological)
        split_idx = int(len(x) * 0.8)
        x_train, x_test = x[:split_idx], x[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        # Free memory
        del df
        gc.collect()

        self.X_train, self.X_test = x_train, x_test
        self.y_train, self.y_test = y_train, y_test

        print(f"Training data shape: {self.X_train.shape}, Test data shape: {self.X_test.shape}")

        class_counts = np.bincount(self.y_train.astype(int))
        print(f"Class distribution in training data: 0={class_counts[0]}, 1={class_counts[1]}")

        print(f"Data preparation completed in {time.time() - start_time:.2f} seconds")

    def resample_data(self, df):
        print(f"Resampling data to {self.timeframe} timeframe...")
        df = df.resample(self.timeframe).agg({
            'Open': 'first',
            'High': 'max',
            'Low': 'min',
            'Close': 'last',
            'Volume': 'sum'
        })
        print(f"Shape after resampling: {df.shape}")
        return df

    def load_new_data_from_model(self):
        """Load new data and identify missing entries compared to the database."""
        new_data = pd.read_csv("./data/btcusd_1-min_data.csv")
        new_data['Timestamp'] = pd.to_datetime(new_data['Timestamp'], unit='s')

        existing_data = pd.read_sql('SELECT * FROM bitcoin_data', self.engine, index_col='Timestamp',
                                    parse_dates=['Timestamp'])

        # Show the most recent entries in the database
        last_entries = existing_data.sort_index(ascending=False).head(10)
        print("Most recent entries in database:")
        print(last_entries)

        # Find missing data
        latest_timestamp = existing_data.index.max()
        missing_data = new_data[new_data['Timestamp'] > latest_timestamp]

        print(f"New data total length: {len(new_data)}")
        print(f"Missing data entries: {len(missing_data)}")
        print(f"Existing data entries: {len(existing_data)}")

        return missing_data

    def preprocess_data(self, data):
        """Preprocess new data with feature engineering and scaling."""
        # Add technical indicators
        data = self.add_essential_features(data)

        # Scale the features using the same scaler as training data
        if self.scaler is not None:
            data[self.feature_columns] = self.scaler.transform(data[self.feature_columns])
        else:
            # If no scaler exists, fit a new one
            scaler = RobustScaler()
            data[self.feature_columns] = scaler.fit_transform(data[self.feature_columns])

        return data

    def make_predictions_w_reality(self, new_data):
        """Make predictions and compare with actual outcomes."""
        # Ensure the 'Timestamp' column is present
        if 'Timestamp' not in new_data.columns:
            raise ValueError("Input data must contain a 'Timestamp' column.")

        # Convert 'Timestamp' to datetime and set as index
        new_data['Timestamp'] = pd.to_datetime(new_data['Timestamp'], errors='coerce')
        new_data = new_data.dropna(subset=['Timestamp'])  # Drop rows where Timestamp is NaT
        new_data.set_index('Timestamp', inplace=True)

        # Resample and aggregate data to the specified timeframe
        grouped_data = new_data.resample(self.timeframe).agg({
            'Open': 'first',
            'High': 'max',
            'Low': 'min',
            'Close': 'last',
            'Volume': 'sum'
        }).reset_index()  # Reset index to preserve 'Timestamp' as a column

        if grouped_data.empty:
            print("No new data found.")
            return None, None

        # Preprocess the data
        grouped_data = self.preprocess_data(grouped_data)

        if grouped_data.empty:
            print("No new data after preprocessing.")
            return None, None

        # Create sequences for the model
        X = self.create_sequences_for_prediction(grouped_data[self.feature_columns].values)

        if len(X) == 0:
            print("Not enough data to create sequences.")
            return None, None

        # Generate predictions (flattened to a 1D array of probabilities)
        predictions = self.model.predict(X).flatten()

        # Trim 'grouped_data' to align with sequence length
        grouped_data = grouped_data.iloc[self.timesteps:]  # Align with sequence length

        # Add predictions to the grouped_data DataFrame
        grouped_data['Predictions'] = (predictions > 0.5).astype(int)
        grouped_data['Prediction_Probability'] = predictions

        # Calculate reality (actual price movement)
        grouped_data['Reality'] = (grouped_data['Close'].pct_change() > 0.005).astype(int)

        # Calculate accuracy
        grouped_data['Correct'] = (grouped_data['Predictions'] == grouped_data['Reality']).astype(int)
        accuracy = grouped_data['Correct'].mean()
        print(f"Prediction accuracy: {accuracy:.2f}")

        # Return predictions and reality
        return grouped_data[['Timestamp', 'Predictions', 'Prediction_Probability']], grouped_data['Reality']

    def make_predictions(self, new_data):
        """Make predictions on new data."""
        # Ensure the 'Timestamp' column is present
        if 'Timestamp' not in new_data.columns:
            raise ValueError("Input data must contain a 'Timestamp' column.")

        # Convert 'Timestamp' to datetime and set as index
        new_data['Timestamp'] = pd.to_datetime(new_data['Timestamp'], errors='coerce')
        new_data = new_data.dropna(subset=['Timestamp'])  # Drop rows where Timestamp is NaT
        new_data.set_index('Timestamp', inplace=True)

        # Resample and aggregate data to the specified timeframe
        grouped_data = new_data.resample(self.timeframe).agg({
            'Open': 'first',
            'High': 'max',
            'Low': 'min',
            'Close': 'last',
            'Volume': 'sum'
        }).reset_index()  # Reset index to preserve 'Timestamp' as a column

        if grouped_data.empty:
            print("No new data found.")
            return None

        # Preprocess the data
        grouped_data = self.preprocess_data(grouped_data)

        if grouped_data.empty:
            print("No new data after preprocessing.")
            return None

        # Create sequences for the model
        X = self.create_sequences_for_prediction(grouped_data[self.feature_columns].values)

        if len(X) == 0:
            print("Not enough data to create sequences.")
            return None

        # Generate predictions (flattened to a 1D array of probabilities)
        predictions = self.model.predict(X).flatten()

        # Trim 'grouped_data' to align with sequence length
        grouped_data = grouped_data.iloc[self.timesteps:]

        # Add predictions to the grouped_data DataFrame
        grouped_data['Predictions'] = (predictions > 0.5).astype(int)
        grouped_data['Prediction_Probability'] = predictions

        # Return prediction results
        return grouped_data[['Timestamp', 'Predictions', 'Prediction_Probability']]

    def update_database(self, missing_data):
        """Update the database with the missing data."""
        if missing_data.empty:
            print("No new data to add to the database.")
            return

        missing_data.to_sql('bitcoin_data', self.engine, if_exists='append', index=False)
        print(f"Database updated with {len(missing_data)} new rows.")

    def train_model(self):
        """Train the LSTM model with early stopping and checkpointing."""
        if self.X_train is None or self.y_train is None:
            raise ValueError("Data not loaded. Call load_and_prepare_data() first.")

        # Create model directory if it doesn't exist
        os.makedirs("./models", exist_ok=True)

        # Cap TensorFlow's GPU memory usage
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            try:
                # Limit each GPU to a fixed memory budget (2 GB here; adjust as needed)
                for gpu in gpus:
                    tf.config.experimental.set_virtual_device_configuration(
                        gpu,
                        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)]
                    )
                print("GPU memory limit set")
            except RuntimeError as e:
                print(f"GPU memory limit setting failed: {e}")

        # Create the model
        self.model = self.create_model(input_shape=(self.timesteps, len(self.feature_columns)))

        # Setup callbacks
        current_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        checkpoint_path = f"./models/model_checkpoint_{current_date}.keras"

        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True,
                verbose=1
            ),
            ModelCheckpoint(
                filepath=checkpoint_path,
                save_best_only=True,
                monitor='val_loss',
                verbose=1
            )
        ]

        # Train the model
        print(f"Training model with {self.epochs} epochs and batch size {self.batch_size}...")
        self.history = self.model.fit(
            self.X_train, self.y_train,
            validation_data=(self.X_test, self.y_test),
            epochs=self.epochs,
            batch_size=self.batch_size,
            callbacks=callbacks,
            verbose=1
        )

        # Save the final model
        final_model_path = f"./models/model_final_{current_date}.keras"
        self.model.save(final_model_path)
        print(f"Model saved to {final_model_path}")

    def evaluate_model(self):
        """Evaluate the trained model on test data."""
        if self.model is None:
            raise ValueError("Model not trained. Call train_model() first.")

        # Basic evaluation
        test_loss, test_accuracy = self.model.evaluate(self.X_test, self.y_test, verbose=1)
        print(f"\nTest Accuracy: {test_accuracy:.4f}")
        print(f"Test Loss: {test_loss:.4f}")

        # Make predictions on test data
        y_pred = (self.model.predict(self.X_test) > 0.5).astype(int)

        # Calculate confusion matrix
        cm = confusion_matrix(self.y_test, y_pred)
        print("\nConfusion Matrix:")
        print(cm)

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))

    def plot_history(self):
        """Plot the training history metrics."""
        if self.history is None:
            raise ValueError("No training history available. Train the model first.")

        plt.figure(figsize=(15, 6))

        # Plot accuracy
        plt.subplot(1, 2, 1)
        plt.plot(self.history.history['accuracy'], label='Training Accuracy')
        plt.plot(self.history.history['val_accuracy'], label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True)

        # Plot loss
        plt.subplot(1, 2, 2)
        plt.plot(self.history.history['loss'], label='Training Loss')
        plt.plot(self.history.history['val_loss'], label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc='upper right')
        plt.grid(True)

        plt.tight_layout()

        # Save the plot
        os.makedirs("./plots", exist_ok=True)
        current_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        plt.savefig(f"./plots/training_history_{current_date}.png")

        plt.show()
16 main.py Normal file
@@ -0,0 +1,16 @@
from BitcoinPricePredictor import BitcoinPricePredictor

if __name__ == "__main__":
    # For hourly predictions (pandas resample alias 'H')
    predictor_hourly = BitcoinPricePredictor(db_path='bitcoin_historical_data.db', timeframe='H')

    # For weekly predictions
    # predictor_weekly = BitcoinPricePredictor(db_path='bitcoin_historical_data.db', timeframe='W')

    # Choose which predictor to use
    predictor = predictor_hourly

    predictor.load_and_prepare_data()
    predictor.train_model()
    predictor.evaluate_model()
    predictor.plot_history()
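
# Usage sketch (editor's assumption, not part of the original commit): the script expects
# ./bitcoin_historical_data.db to exist, which can be built from the Kaggle CSV once it has
# been downloaded and unzipped into ./data:
#     python old/parse_btc_csv_to_db.py
#     python main.py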
15 old/datasets.py Normal file
@@ -0,0 +1,15 @@
import os
import subprocess

class Datasets:
    @staticmethod
    def download_kaggle_dataset(dataset_id, download_path):
        os.environ["KAGGLE_CONFIG_DIR"] = os.path.expanduser("~/.kaggle")
        command = ["kaggle", "datasets", "download", "-d", dataset_id, "-p", download_path]

        try:
            subprocess.run(command, check=True)
            print(f"Dataset downloaded successfully to {download_path}")
        except subprocess.CalledProcessError as e:
            print(f"Error downloading dataset: {e}")
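
# Note (assumption, not in the original file): the kaggle CLI resolves credentials from
# KAGGLE_CONFIG_DIR, so a valid ~/.kaggle/kaggle.json API token (downloaded from the
# Kaggle account page) must be present before download_kaggle_dataset() can succeed.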
5 old/download_dataset.py Normal file
@@ -0,0 +1,5 @@
from datasets import Datasets

dataset_id = "mczielinski/bitcoin-historical-data"
download_path = "./data"
Datasets.download_kaggle_dataset(dataset_id, download_path)
36 old/drop_data.py Normal file
@@ -0,0 +1,36 @@
import sqlite3
from datetime import datetime

# Specify the database file path
db_path = 'bitcoin_historical_data.db'

# Create a connection to the database
connection = sqlite3.connect(db_path)

# Create a cursor object
cursor = connection.cursor()

# Define the date threshold
date_threshold = datetime(2025, 1, 15)

# Convert the date threshold to the format used in SQLite (YYYY-MM-DD HH:MM:SS.SSS)
date_threshold_str = date_threshold.strftime('%Y-%m-%d 00:00:00.000')

# SQL query to delete rows with Timestamp greater than the date threshold
query = """
DELETE FROM bitcoin_data
WHERE Timestamp > ?
"""

# Execute the query with the date threshold as a parameter
cursor.execute(query, (date_threshold_str,))

# Commit the changes
connection.commit()

# Get the number of deleted rows
deleted_rows = cursor.rowcount
print(f"Deleted {deleted_rows} rows with Timestamp greater than January 15th, 2025")

# Close the connection
connection.close()
21 old/new_data_prediction.py Normal file
@@ -0,0 +1,21 @@
from BitcoinPricePredictor import BitcoinPricePredictor
from tensorflow.keras.models import load_model
from sklearn.metrics import confusion_matrix

if __name__ == "__main__":
    model = load_model('models/model_2025-01-21_04-49-43.h5')
    # timeframe is required by BitcoinPricePredictor.__init__; 'H' is assumed here to match main.py
    predictor = BitcoinPricePredictor(model=model, db_path='bitcoin_historical_data.db', timeframe='H')

    missing_data = predictor.load_new_data_from_model()

    print(f"missing data {len(missing_data)}")
    if not missing_data.empty:
        predictions, reality = predictor.make_predictions_w_reality(missing_data)
        print(f"predictions {len(predictions)}")

        cm = confusion_matrix(reality, predictions['Predictions'])
        print("Confusion Matrix:")
        print(cm)
    else:
        print("No new data found.")
32 old/parse_btc_csv_to_db.py Normal file
@@ -0,0 +1,32 @@
import pandas as pd
from sqlalchemy import create_engine, text

# Load the dataset
df = pd.read_csv('./data/btcusd_1-min_data.csv')

# Preprocess the data
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df.set_index('Timestamp', inplace=True)

# Remove rows with invalid Timestamps
df = df[~df.index.isna()]

# Create a connection to the SQLite database
engine = create_engine('sqlite:///bitcoin_historical_data.db')

# Get the last timestamp from the database (assumes the bitcoin_data table already exists)
with engine.connect() as connection:
    query = text("SELECT MAX(Timestamp) FROM bitcoin_data")
    last_timestamp = connection.execute(query).fetchone()[0]

# If there is no data in the table, last_timestamp will be None
if last_timestamp is not None:
    # Filter the new data to include only rows with a timestamp later than the last timestamp in the database
    df = df[df.index > last_timestamp]

# If there are new rows, append them to the database
if not df.empty:
    df.to_sql('bitcoin_data', engine, if_exists='append', index=True)
    print(f"Added {len(df)} new rows to the database.")
else:
    print("No new data to add.")
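
# Optional guard (a sketch, not part of the original commit): on a fresh database the
# SELECT MAX query above raises OperationalError because bitcoin_data does not exist yet.
# One way to handle the first run, assuming SQLAlchemy 1.4+:
#
#     from sqlalchemy import inspect
#
#     if inspect(engine).has_table('bitcoin_data'):
#         with engine.connect() as connection:
#             last_timestamp = connection.execute(
#                 text("SELECT MAX(Timestamp) FROM bitcoin_data")).fetchone()[0]
#     else:
#         last_timestamp = None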
2 tests/test_if_GPU.py Normal file
@@ -0,0 +1,2 @@
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
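
# Note (assumption, not in the original file): device_lib is a private TensorFlow module;
# an equivalent check through the public API is
#     import tensorflow as tf
#     print(tf.config.list_physical_devices('GPU'))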
11 tests/test_kaggle_download.py Normal file
@@ -0,0 +1,11 @@
import os
import sys

# datasets.py lives in old/ in this commit, so put that directory on the import path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'old')))

from datasets import Datasets

download_path = "./data"  # Path where the dataset will be downloaded
dataset_id = "mczielinski/bitcoin-historical-data"

Datasets.download_kaggle_dataset(dataset_id, download_path)