article analyzer with finbert working

2025-03-19 15:28:30 +08:00
parent 7c5602543d
commit e1465539d2
7 changed files with 1975 additions and 55 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
 # Default ignored files
 /shelf/
 /workspace.xml
--- a/article_analyzer.py
+++ b/article_analyzer.py
@@ -1,7 +1,8 @@
 from enum import Enum
-
+from transformers import pipeline
 import ollama
 from pydantic import BaseModel
 import markdownify
 class Category(str, Enum):
    REGULATORY_NEWS = "Regulatory News"
@@ -23,6 +24,7 @@ class ArticleClassification(BaseModel):
 class ArticleAnalyzer:
    def __init__(self):
        self.classifier = pipeline("text-classification", model="ProsusAI/finbert")
        self.base_prompt = """
            Classify the following article into one of these categories: 
            - Regulatory News
@@ -35,9 +37,28 @@ class ArticleAnalyzer:
            Also, assign a sentiment (Positive, Neutral or Negative)
        """
-        print(f"This JSON model is going to be used for structured ouput classification : {ArticleClassification.model_json_schema()}")
+        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")
-    def classify_article(self, article_text):
+    @staticmethod
    def convert_to_markdown(html_content: str) -> str:
        """
        Convert HTML content to Markdown format.

        Thin wrapper around ``markdownify.markdownify``; the only work done
        here is choosing the conversion options passed along with the input.

        Args:
            html_content: Cleaned HTML content
        Returns:
            Markdown content, as returned by ``markdownify.markdownify``
        """
        return markdownify.markdownify(
            html_content,
            # NOTE(review): these tags are handed to markdownify's ``strip``
            # option; verify whether the text inside <script>/<style> is
            # dropped or kept as plain text by the installed version.
            strip=["script", "style", "img", "svg"],
            # NOTE(review): ``strip_tags`` does not look like a documented
            # markdownify option -- confirm it has any effect.
            strip_tags=True,
            # Headings are requested in ATX form ("# Title").
            heading_style="atx",
            # NOTE(review): ``code_block`` does not look like a documented
            # markdownify option either -- confirm it has any effect.
            code_block=True
        )
    def classify_article_llm(self, article_text):
        prompt = f"""{self.base_prompt}
        ARTICLE: {article_text}
@@ -49,4 +70,9 @@ class ArticleAnalyzer:
        response = ollama.chat(model="llama3.2",
                               messages=[{"role": "user", "content": prompt}],
                               format=ArticleClassification.model_json_schema())
-        return response['message']['content']
+        return response['message']['content']
    def classify_article_finbert(self, article_html, max_length=512):
        """
        Classify an HTML article with the FinBERT text-classification pipeline.

        The HTML is first converted to Markdown with ``convert_to_markdown``
        and the resulting text is passed to ``self.classifier``.

        Args:
            article_html: HTML content of the article
            max_length: Maximum number of tokens fed to the model; anything
                beyond it is truncated. Defaults to 512, the usual input
                limit of BERT-based models such as FinBERT.
        Returns:
            The result produced by ``self.classifier`` for the article text
        """
        article_md = self.convert_to_markdown(article_html)
        # Whole articles routinely exceed the model's maximum sequence length;
        # without explicit truncation the pipeline fails on such inputs
        # instead of classifying the leading part of the text.
        return self.classifier(article_md, truncation=True, max_length=max_length)
--- a/bitcoin_trend_analysis.py
+++ b/bitcoin_trend_analysis.py
@@ -56,67 +56,25 @@ class BitcoinTrendAnalysis:
        smooth_prices = pd.Series(prices).rolling(window=smoothing_window).mean()
        fig, ax = plt.subplots(figsize=(14, 7))
-        plt.subplots_adjust(bottom=0.25)  # Space for widgets
+        plt.subplots_adjust(bottom=0.25)
        ax2 = ax.twinx()  # Secondary axis for prominence
        # Initial peaks and prominences
        peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices, window=window, factor=prominence_factor,
                                                               distance=distance)
-        # Plot main price curve
+        ax.plot(self.df.index, smooth_prices, label='Bitcoin Smooth Price')
-        price_line, = ax.plot(self.df.index, smooth_prices, label='Bitcoin Smooth Price')
+        ax.scatter(self.df.index[peaks], smooth_prices[peaks], color='green', s=100, marker='^', label='Local Maxima')
        ax.scatter(self.df.index[valleys], smooth_prices[valleys], color='red', s=100, marker='v', label='Local Minima')
-        # Scatter plots for peaks/valleys
+        for peak, valley in zip(peaks, valleys):
-        peaks_plot = ax.scatter(self.df.index[peaks], smooth_prices[peaks], color='green', s=100, marker='^',
+            ax.plot([self.df.index[peak], self.df.index[valley]], [smooth_prices[peak], smooth_prices[valley]],
-                                label='Local Maxima')
+                    color='orange', lw=1)
        valleys_plot = ax.scatter(self.df.index[valleys], smooth_prices[valleys], color='red', s=100, marker='v',
                                  label='Local Minima')
        # Prominence line on secondary y-axis
        prominence_line, = ax2.plot(self.df.index, prominences, color="purple", linestyle="dashed", alpha=0.7,
                                    label="Prominence")
        ax2.set_ylabel("Prominence")
        ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={prominence_factor}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Price')
        ax.legend()
        ax2.legend(loc="upper right")
        ax.grid(True)
        # Slider setup
        ax_slider = plt.axes([0.2, 0.05, 0.65, 0.03])  # Positioning of slider
        slider = Slider(ax_slider, 'Prom Factor', 0.1, 2.0, valinit=prominence_factor, valstep=0.05)
        # Update function for slider
        def update_plot(factor):
            # Recalculate peaks and prominences
            peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices.to_numpy(), window=window,
                                                                   factor=factor, distance=distance)
            print(len(peaks))
            # Update scatter points for peaks
            peaks_plot.set_offsets(np.column_stack([
                (self.df.index[peaks] - np.datetime64('1970-01-01')) / np.timedelta64(1, 's'),
                smooth_prices[peaks]
            ]))
            # Update scatter points for valleys
            valleys_plot.set_offsets(np.column_stack([
                (self.df.index[valleys] - np.datetime64('1970-01-01')) / np.timedelta64(1, 's'),
                smooth_prices[valleys]
            ]))
            # Update prominence line
            prominence_line.set_ydata(prominences)
            # Update the title to reflect the current prominence factor
            ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={factor}')
            # Redraw the figure
            fig.canvas.draw_idle()
        slider.on_changed(update_plot)  # Update plot when slider changes
        plt.show()
    def analyze_trends_linear_regression(self):
--- a/main_article_analyzer.py
+++ b/main_article_analyzer.py
@@ -20,5 +20,5 @@ if __name__ == "__main__":
    print(f"Parsed {len(html_files)} html files")
    for file, content in html_files.items():
-        result = analyzer.classify_article(content)
+        result = analyzer.classify_article_finbert(content)
        print(f"article [{file}] - analyzed as [{result}]\n")
--- a/main_trend_analysis.py
+++ b/main_trend_analysis.py
@@ -3,4 +3,4 @@ from bitcoin_trend_analysis import BitcoinTrendAnalysis
 if __name__ == "__main__":
    ma = BitcoinTrendAnalysis(db_path='bitcoin_historical_data.db')
    ma.load_data()
-    ma.analyze_trends_peaks(distance=1)
+    ma.analyze_trends_peaks(distance=1, prominence_factor=0.1)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,30 @@
 [project]
 name = "cryptomarketparser"
 version = "0.1.0"
 description = ""
 authors = [
    {name = "Simon Moisy",email = "simon.moisy@tutanota.com"}
 ]
 readme = "README.md"
 requires-python = ">=3.10,<4.0"
 dependencies = [
    "numpy (>=2.2.3,<3.0.0)",
    "pandas (>=2.2.3,<3.0.0)",
    "sqlalchemy (>=2.0.39,<3.0.0)",
    "scipy (>=1.15.2,<2.0.0)",
    "matplotlib (>=3.10.1,<4.0.0)",
    "scikit-learn (>=1.6.1,<2.0.0)",
    "ollama (>=0.4.7,<0.5.0)",
    "transformers (>=4.49.0,<5.0.0)",
    "markdownify (>=1.1.0,<2.0.0)"
 ]
 [build-system]
 requires = ["poetry-core>=2.0.0,<3.0.0"]
 build-backend = "poetry.core.masonry.api"
 [[tool.poetry.source]]
 name = "pytorch"
 url = "https://download.pytorch.org/whl/cu121"
 priority = "explicit"