From a1cd2a0b06e99a8c669933bf9fa437f469705266 Mon Sep 17 00:00:00 2001
From: Simon Moisy <simon.moisy@tutanota.com>
Date: Tue, 18 Mar 2025 10:55:35 +0800
Subject: [PATCH] first attempt at an article analyzer using ollama
 structured output

---
 article_analyzer.py      | 47 ++++++++++++++++++++++++++++++++++++++++
 main.py                  | 16 --------------
 main_article_analyzer.py | 24 ++++++++++++++++++++
 main_price_predictor.py  |  8 +++++++
 main_trend_analysis.py   |  6 +++++
 5 files changed, 85 insertions(+), 16 deletions(-)
 create mode 100644 article_analyzer.py
 delete mode 100644 main.py
 create mode 100644 main_article_analyzer.py
 create mode 100644 main_price_predictor.py
 create mode 100644 main_trend_analysis.py
diff --git a/article_analyzer.py b/article_analyzer.py
new file mode 100644
index 0000000..bbef80f
--- /dev/null
+++ b/article_analyzer.py
@@ -0,0 +1,75 @@
+"""Classify crypto news articles with a local Ollama model (structured output)."""
+
+from enum import Enum
+from typing import Literal
+
+import ollama
+from pydantic import BaseModel
+
+
+class Category(str, Enum):
+    """Closed set of topics an article can be assigned to."""
+
+    REGULATORY_NEWS = "Regulatory News"
+    INSTITUTIONAL_ADOPTION = "Institutional Adoption"
+    MARKET_SENTIMENT = "Market Sentiment"
+    MACROECONOMIC_FACTORS = "Macroeconomic Factors"
+    SECURITY_HACKS = "Security & Hacks"
+    TECHNOLOGICAL_DEVELOPMENTS = "Technological Developments"
+    WHALE_EXCHANGE_ACTIVITY = "Whale & Exchange Activity"
+
+
+class ArticleClassification(BaseModel):
+    """Shape of the JSON reply requested from the model."""
+
+    category: Category
+    # Literal instead of a bare int: the three allowed values end up in the
+    # generated JSON schema, so the schema itself restricts the reply to them.
+    sentiment: Literal[-1, 0, 1]
+
+
+class ArticleAnalyzer:
+    """Classify article text into a Category plus a sentiment via ollama.chat."""
+
+    def __init__(self, model="llama3.2"):
+        """Prepare the classification prompt.
+
+        Args:
+            model: Name of the Ollama model to query.
+        """
+        self.model = model
+        # Build the bullet list from the enum so prompt and schema cannot drift.
+        categories = "\n".join(f"- {category.value}" for category in Category)
+        self.base_prompt = (
+            "Classify the following article into one of these categories:\n"
+            f"{categories}\n"
+            "\n"
+            "Also, assign a sentiment (1 for Positive, -1 for Negative, or 0 for Neutral)."
+        )
+        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")
+
+    def classify_article(self, article_text):
+        """Ask the model to classify one article.
+
+        Args:
+            article_text: Text of the article to classify.
+
+        Returns:
+            The content of the model's reply: a JSON string that can be parsed
+            with ArticleClassification.model_validate_json().
+        """
+        # Request JSON explicitly so the prompt agrees with the schema passed
+        # through `format`.
+        prompt = (
+            f"{self.base_prompt}\n"
+            "\n"
+            f"ARTICLE: {article_text}\n"
+            "\n"
+            'Respond in JSON with the keys "category" and "sentiment".'
+        )
+        response = ollama.chat(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            format=ArticleClassification.model_json_schema(),
+        )
+        return response["message"]["content"]
diff --git a/main.py b/main.py
deleted file mode 100644
index 01db0d2..0000000
--- a/main.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from BitcoinPricePredictor import BitcoinPricePredictor
-
-if __name__ == "__main__":
-    # For daily predictions (default)
-    predictor_daily = BitcoinPricePredictor(db_path='bitcoin_historical_data.db', timeframe='H')
-    
-    # For weekly predictions
-    # predictor_weekly = BitcoinPricePredictor(db_path='bitcoin_historical_data.db', timeframe='W')
-    
-    # Choose which predictor to use
-    predictor = predictor_daily
-    
-    predictor.load_and_prepare_data()
-    predictor.train_model()
-    predictor.evaluate_model()
-    predictor.plot_history()
diff --git a/main_article_analyzer.py b/main_article_analyzer.py
new file mode 100644
index 0000000..a13a9d8
--- /dev/null
+++ b/main_article_analyzer.py
@@ -0,0 +1,82 @@
+"""Classify every HTML article found under ./data and print the result."""
+
+import os
+from html.parser import HTMLParser
+
+from article_analyzer import ArticleAnalyzer
+
+
+class _TextExtractor(HTMLParser):
+    """Collect the text nodes of an HTML document, skipping script/style bodies."""
+
+    _SKIPPED_TAGS = frozenset({"script", "style"})
+
+    def __init__(self):
+        super().__init__()
+        self._chunks = []
+        self._skip_depth = 0
+
+    def handle_starttag(self, tag, attrs):
+        if tag in self._SKIPPED_TAGS:
+            self._skip_depth += 1
+
+    def handle_endtag(self, tag):
+        if tag in self._SKIPPED_TAGS and self._skip_depth:
+            self._skip_depth -= 1
+
+    def handle_data(self, data):
+        stripped = data.strip()
+        if stripped and not self._skip_depth:
+            self._chunks.append(stripped)
+
+    def text(self):
+        """Return the collected text, one node per line."""
+        return "\n".join(self._chunks)
+
+
+def _html_to_text(html):
+    """Reduce an HTML document to its text content."""
+    extractor = _TextExtractor()
+    extractor.feed(html)
+    extractor.close()
+    return extractor.text()
+
+
+def read_html_files(folder_path):
+    """Read every ``.html`` file below *folder_path*.
+
+    Args:
+        folder_path: Directory that is walked recursively.
+
+    Returns:
+        dict mapping each file path to the file's raw HTML content.
+    """
+    html_contents = {}
+    for root, dirs, files in os.walk(folder_path):
+        dirs.sort()  # in-place sort makes the traversal order reproducible
+        for file in sorted(files):
+            if file.endswith(".html"):
+                file_path = os.path.join(root, file)
+                # A file that is not valid UTF-8 should not abort the whole
+                # batch; undecodable bytes are replaced instead.
+                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+                    html_contents[file_path] = f.read()
+    return html_contents
+
+
+def main():
+    """Classify each article under ./data and print the model's answer."""
+    analyzer = ArticleAnalyzer()
+
+    html_files = read_html_files("./data")
+    print(f"Parsed {len(html_files)} html files")
+
+    for file, content in html_files.items():
+        # Send text only: markup, scripts and styles would crowd the article
+        # itself out of the model's context window.
+        result = analyzer.classify_article(_html_to_text(content))
+        print(f"article [{file}] - analyzed as [{result}]\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/main_price_predictor.py b/main_price_predictor.py
new file mode 100644
index 0000000..d5bf9dc
--- /dev/null
+++ b/main_price_predictor.py
@@ -0,0 +1,21 @@
+"""Drive a BitcoinPricePredictor through load, train, evaluate and plot."""
+
+from BitcoinPricePredictor import BitcoinPricePredictor
+
+
+def main():
+    """Run the load -> train -> evaluate -> plot sequence."""
+    predictor = BitcoinPricePredictor(
+        db_path='bitcoin_historical_data.db',
+        timeframe='H',
+    )
+    # NOTE(review): the removed main.py called load_and_prepare_data();
+    # confirm BitcoinPricePredictor really exposes load_data().
+    predictor.load_data()
+    predictor.train_model()
+    predictor.evaluate_model()
+    predictor.plot_history()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/main_trend_analysis.py b/main_trend_analysis.py
new file mode 100644
index 0000000..887732f
--- /dev/null
+++ b/main_trend_analysis.py
@@ -0,0 +1,14 @@
+"""Run BitcoinTrendAnalysis over bitcoin_historical_data.db."""
+
+from bitcoin_trend_analysis import BitcoinTrendAnalysis
+
+
+def main():
+    """Load the data, then call analyze_trends_peaks with distance=1."""
+    analysis = BitcoinTrendAnalysis(db_path='bitcoin_historical_data.db')
+    analysis.load_data()
+    analysis.analyze_trends_peaks(distance=1)
+
+
+if __name__ == "__main__":
+    main()