# orderflow_backtest/storage.py
"""Storage utilities to reconstruct an in-memory orderbook from a SQLite DB.
This module defines lightweight data structures for orderbook levels, trades,
and a `Storage` facade that can hydrate a `Book` incrementally from rows stored
in a SQLite file produced by an external data collector.
"""
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Iterator, Tuple
import time
import logging
from models import OrderbookLevel, Trade, BookSnapshot, Book, MetricCalculator, Metric
from repositories.sqlite_repository import SQLiteOrderflowRepository
from repositories.sqlite_metrics_repository import SQLiteMetricsRepository
from parsers.orderbook_parser import OrderbookParser
class Storage:
    """High-level facade to read historical orderflow into a `Book`.

    Attributes:
        instrument: Symbol/instrument name (e.g., "BTC-USDT").
        book: In-memory orderbook that maintains the current state and tracks timestamps.
    """

    def __init__(self, instrument: str) -> None:
        self.instrument = instrument
        self.book = Book()
        # Pre-build interned float objects for common price points so the
        # parser can reuse one object per price instead of re-allocating.
        # Covers 0.1, 0.6, 1.1, ... up to 100000.1 in steps of 0.5.
        price_points = (step / 10 for step in range(1, 1000001, 5))
        self._price_cache = {price: price for price in price_points}
        # Verbose-diagnostics flag forwarded to the parser.
        self._debug = False
        self._parser = OrderbookParser(price_cache=self._price_cache, debug=self._debug)
def build_booktick_from_db(self, db_path: Path, db_date: datetime) -> None:
    """Hydrate the in-memory `book` from a SQLite database and calculate metrics.

    Builds a Book instance with sequential snapshots and calculates OBI/CVD metrics.

    Args:
        db_path: Path to the SQLite database file.
        db_date: Date associated with the database (currently informational).
    """
    # Discard any previously hydrated state so repeated calls start clean.
    self.book = Book()
    metrics_repo = SQLiteMetricsRepository(db_path)
    with metrics_repo.connect() as conn:
        # Make sure the destination table exists before we try to write metrics.
        if not metrics_repo.table_exists(conn, "metrics"):
            metrics_repo.create_metrics_table(conn)
        # Trades are pre-grouped by timestamp for O(1) lookup per snapshot.
        trades_by_timestamp = metrics_repo.load_trades_by_timestamp(conn)
        # Bail out early when there is no orderbook data at all.
        total_rows = metrics_repo.count_rows(conn, "book")
        if total_rows == 0:
            logging.info(f"No orderbook data found in {db_path}")
            return
        # Stream rows and compute/store metrics as we go.
        rows_iter = metrics_repo.iterate_book_rows(conn)
        self._create_snapshots_and_metrics(rows_iter, trades_by_timestamp, total_rows, conn, metrics_repo)
        # Final summary of what was processed.
        logging.info(f"Processed {len(self.book.snapshots)} snapshots with metrics from {db_path}")
def _create_snapshots_and_metrics(
    self,
    rows_iter: Iterator[Tuple[int, str, str, int]],
    trades_by_timestamp: Dict[int, List[Trade]],
    total_rows: int,
    conn,
    metrics_repo: SQLiteMetricsRepository,
) -> None:
    """Create BookSnapshot instances, compute OBI/CVD metrics, and persist them.

    Args:
        rows_iter: Iterator yielding (id, bids_text, asks_text, timestamp).
        trades_by_timestamp: Dictionary mapping timestamps to lists of trades.
        total_rows: Total number of rows in the book table (used for progress).
        conn: Open database connection used to store metrics.
        metrics_repo: Repository instance for metrics operations.
    """
    # CVD is cumulative across the whole run, carried from snapshot to snapshot.
    current_cvd = 0.0
    # Track the most recent OBI separately for progress logging.
    # BUG FIX: the previous code formatted `metrics_batch[-1].obi if metrics_batch
    # else 'N/A'` with a `:.3f` spec; when the batch had just been flushed the
    # string 'N/A' hit the float format spec and raised ValueError.
    last_obi: Optional[float] = None
    metrics_batch: List[Metric] = []
    batch_size = 1000  # flush metrics to the DB in batches for performance
    log_every = max(1, total_rows // 20)
    processed = 0
    start_time = time.time()
    last_report_time = start_time
    for row_id, bids_text, asks_text, timestamp in rows_iter:
        snapshot = self._snapshot_from_row(row_id, bids_text, asks_text, timestamp, trades_by_timestamp)
        if snapshot is not None:
            # Per-snapshot metrics: orderbook imbalance and volume delta / CVD.
            obi = MetricCalculator.calculate_obi(snapshot)
            last_obi = obi
            trades = trades_by_timestamp.get(int(timestamp), [])
            volume_delta = MetricCalculator.calculate_volume_delta(trades)
            current_cvd = MetricCalculator.calculate_cvd(current_cvd, volume_delta)
            best_bid, best_ask = MetricCalculator.get_best_bid_ask(snapshot)
            metrics_batch.append(Metric(
                snapshot_id=row_id,
                timestamp=int(timestamp),
                obi=obi,
                cvd=current_cvd,
                best_bid=best_bid,
                best_ask=best_ask,
            ))
            # Keep the in-memory book in sync (for compatibility with callers
            # that read `self.book.snapshots`).
            self.book.add_snapshot(snapshot)
            # Flush full batches so memory stays bounded on large DBs.
            if len(metrics_batch) >= batch_size:
                metrics_repo.insert_metrics_batch(conn, metrics_batch)
                conn.commit()
                metrics_batch = []
        processed += 1
        # Throttled progress report: at most once per second, on log_every rows.
        current_time = time.time()
        if processed % log_every == 0 and current_time - last_report_time > 1.0:
            obi_text = f"{last_obi:.3f}" if last_obi is not None else "N/A"
            logging.info(
                f"{processed / total_rows * 100:.1f}% - OBI: {obi_text} - "
                f"CVD: {current_cvd:.1f} - {processed/(current_time-start_time):.1f} rows/sec"
            )
            last_report_time = current_time
    # Persist whatever is left in the final partial batch.
    if metrics_batch:
        metrics_repo.insert_metrics_batch(conn, metrics_batch)
        conn.commit()
def _create_snapshots_from_rows(self, rows_iter: Iterator[Tuple[int, str, str, int]], trades_by_timestamp: Dict[int, List[Trade]], total_rows: int) -> None:
    """Create BookSnapshot instances from database rows and add them to the book.

    Args:
        rows_iter: Iterator yielding (id, bids_text, asks_text, timestamp).
        trades_by_timestamp: Dictionary mapping timestamps to lists of trades.
        total_rows: Total number of rows in the book table (used for progress).
    """
    # Bind once to avoid repeated attribute lookups in the hot loop.
    book = self.book
    # Log roughly every 5% of rows, throttled to once per second.
    log_every = max(1, total_rows // 20)
    processed = 0
    started = time.time()
    last_logged = started
    for row_id, bids_text, asks_text, timestamp in rows_iter:
        snapshot = self._snapshot_from_row(row_id, bids_text, asks_text, timestamp, trades_by_timestamp)
        if snapshot is not None:
            book.add_snapshot(snapshot)
        processed += 1
        now = time.time()
        if processed % log_every == 0 and now - last_logged > 1.0:
            latest = book.snapshots[-1] if book.snapshots else None
            logging.info(
                f"{processed / total_rows * 100:.1f}% - asks {len(latest.asks) if latest else 0} - "
                f"bids {len(latest.bids) if latest else 0} - "
                f"{processed/(now-started):.1f} rows/sec"
            )
            last_logged = now
def _snapshot_from_row(
    self,
    row_id: int,
    bids_text: str,
    asks_text: str,
    timestamp: int,
    trades_by_timestamp: Dict[int, List[Trade]],
) -> Optional[BookSnapshot]:
    """Build a `BookSnapshot` from one DB row and attach its trades.

    Returns:
        The parsed snapshot, or None if either side is empty after parsing.
    """
    ts = int(timestamp)
    snapshot = BookSnapshot(
        id=row_id,
        timestamp=ts,
        bids={},
        asks={},
        trades=trades_by_timestamp.get(ts, []),
    )
    # Both sides are populated in place by the shared parser.
    self._parser.parse_side(bids_text, snapshot.bids)
    self._parser.parse_side(asks_text, snapshot.asks)
    # A snapshot missing an entire side is unusable — drop it.
    if not (snapshot.bids and snapshot.asks):
        return None
    return snapshot
def _parse_orderbook_side(self, text: str, side_dict: Dict[float, OrderbookLevel]) -> None:
    """Backwards-compatibility shim: forwards to `OrderbookParser.parse_side`."""
    self._parser.parse_side(text, side_dict)
# NOTE: A legacy data-access helper that previously lived here has been
# removed; its responsibilities are now covered by the repository classes
# (`SQLiteOrderflowRepository` / `SQLiteMetricsRepository`).