# CryptoMarketParser/article_analyzer.py

from enum import Enum
from transformers import pipeline
import ollama
from pydantic import BaseModel
import markdownify

# Closed set of topic labels an article can be assigned.
# Inherits from ``str`` so every member is also a plain string equal to its
# value. The values match, one for one, the category list spelled out in
# ``ArticleAnalyzer.base_prompt``; keep the two in sync when editing either.
# NOTE(review): documented with ``#`` comments on purpose - this enum is part
# of ``ArticleClassification.model_json_schema()``, and a class docstring may
# be emitted there as a ``description``; confirm before converting.
class Category(str, Enum):
    REGULATORY_NEWS = "Regulatory News"
    INSTITUTIONAL_ADOPTION = "Institutional Adoption"
    MARKET_SENTIMENT = "Market Sentiment"
    MACROECONOMIC_FACTORS = "Macroeconomic Factors"
    SECURITY_HACKS = "Security & Hacks"
    TECHNOLOGICAL_DEVELOPMENTS = "Technological Developments"
    WHALE_EXCHANGE_ACTIVITY = "Whale & Exchange Activity"

# Three-way sentiment label for an article.
# Inherits from ``str`` so every member is also a plain string equal to its
# value. The values match the options named in ``ArticleAnalyzer.base_prompt``
# ("Positive, Neutral or Negative").
# NOTE(review): documented with ``#`` comments on purpose - this enum is part
# of ``ArticleClassification.model_json_schema()``, and a class docstring may
# be emitted there as a ``description``; confirm before converting.
class Sentiment(str, Enum):
    POSITIVE = "Positive"
    NEUTRAL = "Neutral"
    NEGATIVE = "Negative"

# Structured classification result: one category plus one sentiment.
# Its JSON schema (``model_json_schema()``) is printed by
# ``ArticleAnalyzer.__init__`` and passed to ``ollama.chat`` as ``format`` in
# ``ArticleAnalyzer.classify_article_llm``.
# NOTE(review): documented with ``#`` comments on purpose - a class docstring
# would presumably appear in the generated schema as ``description`` and thus
# change what is sent to the model; confirm before converting.
class ArticleClassification(BaseModel):
    category: Category  # topic label, one of the Category members
    sentiment: Sentiment  # sentiment label, one of the Sentiment members

class ArticleAnalyzer:
    """Classify news articles by topic and sentiment.

    Two independent back ends are offered:

    * ``classify_article_llm`` asks an Ollama chat model for a category and
      a sentiment, constraining the reply to the JSON schema of
      ``ArticleClassification``.
    * ``classify_article_finbert`` converts the article HTML to Markdown and
      runs a Hugging Face ``text-classification`` pipeline over it
      (FinBERT by default).
    """

    def __init__(self, finbert_model: str = "ProsusAI/finbert", llm_model: str = "llama3.2"):
        """
        Load the text-classification pipeline and prepare the LLM prompt.

        Args:
            finbert_model: Model identifier handed to ``transformers.pipeline``.
            llm_model: Model name handed to ``ollama.chat``.
        """
        self.classifier = pipeline("text-classification", model=finbert_model)
        self.llm_model = llm_model
        # The category list below must stay in sync with the ``Category`` enum.
        self.base_prompt = """
            Classify the following article into one of these categories:
            - Regulatory News
            - Institutional Adoption
            - Market Sentiment
            - Macroeconomic Factors
            - Security & Hacks
            - Technological Developments
            - Whale & Exchange Activity

            Also, assign a sentiment (Positive, Neutral or Negative)
        """
        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")

    @staticmethod
    def convert_to_markdown(html_content: str) -> str:
        """
        Convert HTML content to Markdown format.

        Args:
            html_content: Cleaned HTML content

        Returns:
            Markdown content
        """
        # NOTE(review): ``strip_tags`` and ``code_block`` do not appear among
        # markdownify's documented options and are presumably ignored; they
        # are kept so the call stays identical - confirm and drop if unused.
        # NOTE(review): ``strip`` is documented as "do not convert these
        # tags"; verify that the text inside <script>/<style> is really
        # removed with the installed markdownify version.
        return markdownify.markdownify(
            html_content,
            strip=["script", "style", "img", "svg"],
            strip_tags=True,
            heading_style="atx",
            code_block=True
        )

    def classify_article_llm(self, article_text: str) -> str:
        """
        Classify an article with the configured Ollama chat model.

        Args:
            article_text: Article text inserted verbatim into the prompt.

        Returns:
            The raw message content of the model reply. The reply is
            requested in the JSON schema of ``ArticleClassification`` but is
            not parsed or validated here.
        """
        # NOTE(review): the "OUTPUT FORMAT" section asks for plain
        # "Category:/Sentiment:" lines while ``format`` requests JSON; the
        # prompt text is left untouched here - consider aligning the two.
        prompt = f"""{self.base_prompt}

        ARTICLE: {article_text}

        OUTPUT FORMAT:
        Category: <category>
        Sentiment: <sentiment>
        """
        response = ollama.chat(
            model=self.llm_model,
            messages=[{"role": "user", "content": prompt}],
            format=ArticleClassification.model_json_schema(),
        )
        return response['message']['content']

    def classify_article_finbert(self, article_html: str, max_length: int = 512):
        """
        Classify an article with the text-classification pipeline.

        Args:
            article_html: Article HTML; converted to Markdown before scoring.
            max_length: Maximum number of tokens passed to the model; longer
                input is truncated. The default of 512 is the input limit of
                BERT-based models such as FinBERT.

        Returns:
            Whatever the pipeline returns for the (possibly truncated) text.
        """
        article_md = self.convert_to_markdown(article_html)
        # Without truncation, an article that tokenizes to more than the
        # model's maximum sequence length makes the forward pass fail.
        return self.classifier(article_md, truncation=True, max_length=max_length)