Add initial implementation of the Orderflow Backtest System with OBI and CVD metrics integration, including core modules for storage, strategies, and visualization. Introduced persistent metrics storage in SQLite, optimized memory usage, and enhanced documentation.

This commit is contained in:
Simon Moisy
2025-08-26 17:22:07 +08:00
parent 63f723820a
commit fa6df78c1e
52 changed files with 7039 additions and 1 deletions

7
repositories/__init__.py Normal file
View File

@@ -0,0 +1,7 @@
"""Repository layer for data access implementations (e.g., SQLite).
This package contains concrete repositories used by the `Storage` orchestrator
to read persisted orderflow data.
"""

View File

@@ -0,0 +1,132 @@
from __future__ import annotations
from pathlib import Path
import sqlite3
import logging
from typing import List, Dict, Tuple
from .sqlite_repository import SQLiteOrderflowRepository
from models import Metric
class SQLiteMetricsRepository(SQLiteOrderflowRepository):
    """Write-enabled repository for storing and loading metrics data alongside orderflow data.

    Adds a `metrics` table of per-snapshot OBI/CVD values keyed to `book.id`,
    on top of the read-only orderflow repository.
    """

    def create_metrics_table(self, conn: sqlite3.Connection) -> None:
        """Create the metrics table and its indexes if they do not exist.

        Args:
            conn: Active SQLite database connection.

        Raises:
            sqlite3.Error: If table or index creation fails.
        """
        try:
            # `timestamp` is INTEGER (not TEXT): `load_metrics_by_timerange`
            # binds integer bounds, and with TEXT affinity SQLite would coerce
            # them to text and compare lexicographically ('9' > '10'), breaking
            # range queries. INTEGER keeps comparisons numeric and matches the
            # int round-trip in the loader.
            # NOTE(review): the FOREIGN KEY is only enforced when the
            # connection sets `PRAGMA foreign_keys = ON`, which `connect()`
            # does not currently do — confirm whether enforcement is intended.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS metrics (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    snapshot_id INTEGER NOT NULL,
                    timestamp INTEGER NOT NULL,
                    obi REAL NOT NULL,
                    cvd REAL NOT NULL,
                    best_bid REAL,
                    best_ask REAL,
                    FOREIGN KEY (snapshot_id) REFERENCES book(id)
                )
            """)
            # Indexes for time-range scans and per-snapshot lookups.
            conn.execute("CREATE INDEX IF NOT EXISTS idx_metrics_timestamp ON metrics(timestamp)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_metrics_snapshot_id ON metrics(snapshot_id)")
            conn.commit()
            logging.info("Metrics table and indexes created successfully")
        except sqlite3.Error as e:
            logging.error(f"Error creating metrics table: {e}")
            raise

    def table_exists(self, conn: sqlite3.Connection, table_name: str) -> bool:
        """Check if a table exists in the database.

        Args:
            conn: Active SQLite database connection.
            table_name: Name of the table to check.

        Returns:
            True if the table exists, False otherwise (including on DB error).
        """
        try:
            cursor = conn.cursor()
            # sqlite_master lists all schema objects; filter to tables only.
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                (table_name,)
            )
            return cursor.fetchone() is not None
        except sqlite3.Error as e:
            logging.error(f"Error checking if table {table_name} exists: {e}")
            return False

    def insert_metrics_batch(self, conn: sqlite3.Connection, metrics: List[Metric]) -> None:
        """Insert multiple metrics in a single batch operation for performance.

        This method intentionally does not commit: the caller owns the
        transaction, so several batches can share one commit.

        Args:
            conn: Active SQLite database connection.
            metrics: List of Metric objects to insert.

        Raises:
            sqlite3.Error: If the batch insert fails.
        """
        if not metrics:
            return
        try:
            # Flatten Metric objects into parameter tuples for executemany.
            batch_data = [
                (m.snapshot_id, m.timestamp, m.obi, m.cvd, m.best_bid, m.best_ask)
                for m in metrics
            ]
            conn.executemany(
                "INSERT INTO metrics (snapshot_id, timestamp, obi, cvd, best_bid, best_ask) VALUES (?, ?, ?, ?, ?, ?)",
                batch_data
            )
            logging.debug(f"Inserted {len(metrics)} metrics records")
        except sqlite3.Error as e:
            logging.error(f"Error inserting metrics batch: {e}")
            raise

    def load_metrics_by_timerange(self, conn: sqlite3.Connection, start_timestamp: int, end_timestamp: int) -> List[Metric]:
        """Load metrics within a specified timestamp range.

        Args:
            conn: Active SQLite database connection.
            start_timestamp: Start of the time range (inclusive).
            end_timestamp: End of the time range (inclusive).

        Returns:
            List of Metric objects ordered by timestamp; empty list on error.
        """
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT snapshot_id, timestamp, obi, cvd, best_bid, best_ask FROM metrics WHERE timestamp >= ? AND timestamp <= ? ORDER BY timestamp ASC",
                (start_timestamp, end_timestamp)
            )
            metrics: List[Metric] = []
            # Fetch in fixed-size batches to bound peak memory on large ranges.
            for batch in iter(lambda: cursor.fetchmany(5000), []):
                for snapshot_id, timestamp, obi, cvd, best_bid, best_ask in batch:
                    metrics.append(Metric(
                        snapshot_id=int(snapshot_id),
                        timestamp=int(timestamp),
                        obi=float(obi),
                        cvd=float(cvd),
                        best_bid=float(best_bid) if best_bid is not None else None,
                        best_ask=float(best_ask) if best_ask is not None else None,
                    ))
            return metrics
        except sqlite3.Error as e:
            logging.error(f"Error loading metrics by timerange: {e}")
            return []

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, Iterator, List, Tuple
import sqlite3
import logging
from models import Trade
class SQLiteOrderflowRepository:
    """Read-only repository for loading orderflow data from a SQLite database.

    Connections are tuned for fast bulk reads (journaling and fsync off,
    large page cache, in-memory temp store, generous mmap window), which is
    appropriate for read-only backtest workloads, not for durable writes.
    """

    def __init__(self, db_path: Path) -> None:
        # Location of the SQLite file holding the `book` and `trades` tables.
        self.db_path = db_path

    def connect(self) -> sqlite3.Connection:
        """Open a connection with read-optimized PRAGMA settings applied."""
        connection = sqlite3.connect(str(self.db_path))
        for pragma in (
            "journal_mode = OFF",
            "synchronous = OFF",
            "cache_size = 100000",
            "temp_store = MEMORY",
            "mmap_size = 30000000000",
        ):
            connection.execute(f"PRAGMA {pragma}")
        return connection

    def count_rows(self, conn: sqlite3.Connection, table: str) -> int:
        """Return the number of rows in `table`, or 0 on database error.

        Raises:
            ValueError: If `table` is not one of the whitelisted table names.
        """
        # Whitelist guards the f-string interpolation below against injection.
        if table not in {"book", "trades"}:
            raise ValueError(f"Unsupported table name: {table}")
        try:
            result = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
        except sqlite3.Error as e:
            logging.error(f"Error counting rows in table {table}: {e}")
            return 0
        return int(result[0]) if result and result[0] is not None else 0

    def load_trades_by_timestamp(self, conn: sqlite3.Connection) -> Dict[int, List[Trade]]:
        """Group all trades by integer timestamp, scanning in ascending order.

        Returns an empty dict if a database error occurs mid-scan.
        """
        grouped: Dict[int, List[Trade]] = {}
        try:
            cur = conn.cursor()
            cur.execute(
                "SELECT id, trade_id, price, size, side, timestamp FROM trades ORDER BY timestamp ASC"
            )
            # Stream in fixed-size batches to bound peak memory usage.
            for rows in iter(lambda: cur.fetchmany(5000), []):
                for row_id, trade_id, price, size, side, ts in rows:
                    ts_key = int(ts)
                    grouped.setdefault(ts_key, []).append(
                        Trade(
                            id=row_id,
                            trade_id=float(trade_id),
                            price=float(price),
                            size=float(size),
                            side=str(side),
                            timestamp=ts_key,
                        )
                    )
            return grouped
        except sqlite3.Error as e:
            logging.error(f"Error loading trades: {e}")
            return {}

    def iterate_book_rows(self, conn: sqlite3.Connection) -> Iterator[Tuple[int, str, str, int]]:
        """Yield `(id, bids, asks, timestamp)` book rows in timestamp order."""
        cur = conn.cursor()
        cur.execute("SELECT id, bids, asks, timestamp FROM book ORDER BY timestamp ASC")
        # Batched fetch keeps memory flat while yielding rows one at a time.
        for batch in iter(lambda: cur.fetchmany(5000), []):
            yield from batch