WIP trend analysis

This commit is contained in:
Simon Moisy
2025-03-18 10:13:37 +08:00
parent 302be95ce7
commit 6d9189d0be
6 changed files with 318 additions and 15 deletions

15
utils/datasets.py Normal file
View File

@@ -0,0 +1,15 @@
import os
import subprocess
class Datasets:
    """Helper utilities for fetching external datasets."""

    @staticmethod
    def download_kaggle_dataset(dataset_id, download_path):
        """Download a Kaggle dataset archive into *download_path*.

        Shells out to the ``kaggle`` CLI, which reads its API credentials
        from ``~/.kaggle`` (exported via KAGGLE_CONFIG_DIR below).
        Failures are reported on stdout rather than raised.
        """
        os.environ["KAGGLE_CONFIG_DIR"] = os.path.expanduser("~/.kaggle")
        cli_args = [
            "kaggle", "datasets", "download",
            "-d", dataset_id,
            "-p", download_path,
        ]
        try:
            subprocess.run(cli_args, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Error downloading dataset: {e}")
        else:
            # Only announce success when the CLI exited cleanly.
            print(f"Dataset downloaded successfully to {download_path}")

View File

@@ -0,0 +1,5 @@
from datasets import Datasets

# Fetch the minute-level Bitcoin price history from Kaggle into ./data.
KAGGLE_DATASET_ID = "mczielinski/bitcoin-historical-data"
TARGET_DIR = "./data"

Datasets.download_kaggle_dataset(KAGGLE_DATASET_ID, TARGET_DIR)

36
utils/drop_data.py Normal file
View File

@@ -0,0 +1,36 @@
import sqlite3
from datetime import datetime

# Path to the SQLite database holding the Bitcoin price history.
db_path = 'bitcoin_historical_data.db'

# Rows with a Timestamp strictly after this date are removed (e.g. to trim
# partial/in-progress data past the cutoff).
date_threshold = datetime(2025, 1, 15)
# SQLite stores these timestamps as text (YYYY-MM-DD HH:MM:SS.SSS), so the
# threshold must be rendered in the same format for the comparison to work.
date_threshold_str = date_threshold.strftime('%Y-%m-%d 00:00:00.000')

# Parameterized DELETE keeps the threshold value out of the SQL text.
query = """
DELETE FROM bitcoin_data
WHERE Timestamp > ?
"""

connection = sqlite3.connect(db_path)
try:
    cursor = connection.cursor()
    cursor.execute(query, (date_threshold_str,))
    # Capture rowcount right after the DELETE, before any other statement
    # (or the commit) can disturb it.
    deleted_rows = cursor.rowcount
    connection.commit()
    print(f"Deleted {deleted_rows} rows with Timestamp greater than January 15th, 2025")
finally:
    # Always release the database handle, even if the DELETE fails.
    connection.close()

View File

@@ -0,0 +1,32 @@
import pandas as pd
from sqlalchemy import create_engine, text

# Load the raw minute-level BTC/USD export downloaded from Kaggle.
df = pd.read_csv('./data/btcusd_1-min_data.csv')

# The CSV stores Timestamp as Unix seconds; convert it and use it as index.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df.set_index('Timestamp', inplace=True)
# Drop rows whose Timestamp failed to parse.
df = df[~df.index.isna()]

engine = create_engine('sqlite:///bitcoin_historical_data.db')

# Find the newest row already stored, if the table exists at all.
# NOTE: querying MAX() directly on a missing table raises OperationalError
# on the very first run, so probe sqlite_master for the table first.
last_timestamp = None
with engine.connect() as connection:
    table_exists = connection.execute(
        text("SELECT name FROM sqlite_master WHERE type='table' AND name='bitcoin_data'")
    ).fetchone() is not None
    if table_exists:
        last_timestamp = connection.execute(
            text("SELECT MAX(Timestamp) FROM bitcoin_data")
        ).fetchone()[0]

# If the table was empty (or absent), last_timestamp stays None and the
# whole CSV is loaded; otherwise keep only rows newer than what is stored.
if last_timestamp is not None:
    df = df[df.index > last_timestamp]

# Append only the genuinely new rows.
if not df.empty:
    df.to_sql('bitcoin_data', engine, if_exists='append', index=True)
    print(f"Added {len(df)} new rows to the database.")
else:
    print("No new data to add.")