article analyzer with finbert working

2025-03-19 15:28:30 +08:00
parent 7c5602543d
commit e1465539d2
7 changed files with 1975 additions and 55 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
--- a/article_analyzer.py
+++ b/article_analyzer.py
@@ -1,7 +1,8 @@
 from enum import Enum
-
+from transformers import pipeline
 import ollama
 from pydantic import BaseModel
+import markdownify

 class Category(str, Enum):
    REGULATORY_NEWS = "Regulatory News"
@@ -23,6 +24,7 @@ class ArticleClassification(BaseModel):

 class ArticleAnalyzer:
    def __init__(self):
+        self.classifier = pipeline("text-classification", model="ProsusAI/finbert")
        self.base_prompt = """
            Classify the following article into one of these categories: 
            - Regulatory News
@@ -35,9 +37,28 @@ class ArticleAnalyzer:

            Also, assign a sentiment (Positive, Neutral or Negative)
        """
-        print(f"This JSON model is going to be used for structured ouput classification : {ArticleClassification.model_json_schema()}")
+        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")

-    def classify_article(self, article_text):
+    @staticmethod
+    def convert_to_markdown(html_content: str) -> str:
+        """
+        Convert HTML content to Markdown format.
+
+        Tags listed in ``strip`` are not rendered as Markdown elements and
+        headings are emitted in ATX style (``# Heading``).
+
+        Args:
+            html_content: HTML content to convert
+
+        Returns:
+            Markdown content
+        """
+        # Only options markdownify actually defines are passed: unknown keyword
+        # options are accepted without validation and have no effect.
+        # NOTE(review): `strip` appears to drop only the tag markup and keep the
+        # inner text -- confirm <script>/<style> bodies do not leak into the
+        # output that is fed to the classifier.
+        return markdownify.markdownify(
+            html_content,
+            strip=["script", "style", "img", "svg"],
+            heading_style="atx",
+        )
+
+    def classify_article_llm(self, article_text):
        prompt = f"""{self.base_prompt}

        ARTICLE: {article_text}
@@ -49,4 +70,9 @@ class ArticleAnalyzer:
        response = ollama.chat(model="llama3.2",
                               messages=[{"role": "user", "content": prompt}],
                               format=ArticleClassification.model_json_schema())
-        return response['message']['content']
+        return response['message']['content']
+
+    def classify_article_finbert(self, article_html):
+        """
+        Classify an HTML article with the FinBERT text-classification pipeline.
+
+        The HTML is first converted to Markdown, then passed to the pipeline
+        created in ``__init__``.
+
+        Args:
+            article_html: HTML content of the article
+
+        Returns:
+            The pipeline output for the article
+        """
+        article_md = self.convert_to_markdown(article_html)
+        # Full articles routinely exceed the 512-token limit of BERT-based
+        # models; truncate at tokenization instead of failing on long inputs.
+        return self.classifier(article_md, truncation=True, max_length=512)
--- a/bitcoin_trend_analysis.py
+++ b/bitcoin_trend_analysis.py
@@ -56,67 +56,25 @@ class BitcoinTrendAnalysis:
        smooth_prices = pd.Series(prices).rolling(window=smoothing_window).mean()

        fig, ax = plt.subplots(figsize=(14, 7))
-        plt.subplots_adjust(bottom=0.25)  # Space for widgets
-        ax2 = ax.twinx()  # Secondary axis for prominence
+        plt.subplots_adjust(bottom=0.25)

-        # Initial peaks and prominences
        peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices, window=window, factor=prominence_factor,
                                                               distance=distance)

-        # Plot main price curve
-        price_line, = ax.plot(self.df.index, smooth_prices, label='Bitcoin Smooth Price')
+        ax.plot(self.df.index, smooth_prices, label='Bitcoin Smooth Price')
+        ax.scatter(self.df.index[peaks], smooth_prices[peaks], color='green', s=100, marker='^', label='Local Maxima')
+        ax.scatter(self.df.index[valleys], smooth_prices[valleys], color='red', s=100, marker='v', label='Local Minima')

-        # Scatter plots for peaks/valleys
-        peaks_plot = ax.scatter(self.df.index[peaks], smooth_prices[peaks], color='green', s=100, marker='^',
-                                label='Local Maxima')
-        valleys_plot = ax.scatter(self.df.index[valleys], smooth_prices[valleys], color='red', s=100, marker='v',
-                                  label='Local Minima')
-
-        # Prominence line on secondary y-axis
-        prominence_line, = ax2.plot(self.df.index, prominences, color="purple", linestyle="dashed", alpha=0.7,
-                                    label="Prominence")
-
-        ax2.set_ylabel("Prominence")
+        for peak, valley in zip(peaks, valleys):
+            ax.plot([self.df.index[peak], self.df.index[valley]], [smooth_prices[peak], smooth_prices[valley]],
+                    color='orange', lw=1)

        ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={prominence_factor}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Price')
        ax.legend()
-        ax2.legend(loc="upper right")
        ax.grid(True)

-        # Slider setup
-        ax_slider = plt.axes([0.2, 0.05, 0.65, 0.03])  # Positioning of slider
-        slider = Slider(ax_slider, 'Prom Factor', 0.1, 2.0, valinit=prominence_factor, valstep=0.05)
-
-        # Update function for slider
-        def update_plot(factor):
-            # Recalculate peaks and prominences
-            peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices.to_numpy(), window=window,
-                                                                   factor=factor, distance=distance)
-            print(len(peaks))
-            # Update scatter points for peaks
-            peaks_plot.set_offsets(np.column_stack([
-                (self.df.index[peaks] - np.datetime64('1970-01-01')) / np.timedelta64(1, 's'),
-                smooth_prices[peaks]
-            ]))
-
-            # Update scatter points for valleys
-            valleys_plot.set_offsets(np.column_stack([
-                (self.df.index[valleys] - np.datetime64('1970-01-01')) / np.timedelta64(1, 's'),
-                smooth_prices[valleys]
-            ]))
-
-            # Update prominence line
-            prominence_line.set_ydata(prominences)
-
-            # Update the title to reflect the current prominence factor
-            ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={factor}')
-
-            # Redraw the figure
-            fig.canvas.draw_idle()
-
-        slider.on_changed(update_plot)  # Update plot when slider changes
        plt.show()

    def analyze_trends_linear_regression(self):
--- a/main_article_analyzer.py
+++ b/main_article_analyzer.py
@@ -20,5 +20,5 @@ if __name__ == "__main__":
    print(f"Parsed {len(html_files)} html files")

    for file, content in html_files.items():
-        result = analyzer.classify_article(content)
+        result = analyzer.classify_article_finbert(content)
        print(f"article [{file}] - analyzed as [{result}]\n")
--- a/main_trend_analysis.py
+++ b/main_trend_analysis.py
@@ -3,4 +3,4 @@ from bitcoin_trend_analysis import BitcoinTrendAnalysis
 if __name__ == "__main__":
    ma = BitcoinTrendAnalysis(db_path='bitcoin_historical_data.db')
    ma.load_data()
-    ma.analyze_trends_peaks(distance=1)
+    ma.analyze_trends_peaks(distance=1, prominence_factor=0.1)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,30 @@
+[project]
+name = "cryptomarketparser"
+version = "0.1.0"
+description = ""
+authors = [
+    {name = "Simon Moisy",email = "simon.moisy@tutanota.com"}
+]
+readme = "README.md"
+requires-python = ">=3.10,<4.0"
+dependencies = [
+    "numpy (>=2.2.3,<3.0.0)",
+    "pandas (>=2.2.3,<3.0.0)",
+    "sqlalchemy (>=2.0.39,<3.0.0)",
+    "scipy (>=1.15.2,<2.0.0)",
+    "matplotlib (>=3.10.1,<4.0.0)",
+    "scikit-learn (>=1.6.1,<2.0.0)",
+    "ollama (>=0.4.7,<0.5.0)",
+    "transformers (>=4.49.0,<5.0.0)",
+    "markdownify (>=1.1.0,<2.0.0)"
+]
+
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[[tool.poetry.source]]
+name = "pytorch"
+url = "https://download.pytorch.org/whl/cu121"
+priority = "explicit"