from enum import Enum

from transformers import pipeline
import ollama
from pydantic import BaseModel
import markdownify


class Category(str, Enum):
    REGULATORY_NEWS = "Regulatory News"
    INSTITUTIONAL_ADOPTION = "Institutional Adoption"
    MARKET_SENTIMENT = "Market Sentiment"
    MACROECONOMIC_FACTORS = "Macroeconomic Factors"
    SECURITY_HACKS = "Security & Hacks"
    TECHNOLOGICAL_DEVELOPMENTS = "Technological Developments"
    WHALE_EXCHANGE_ACTIVITY = "Whale & Exchange Activity"


class Sentiment(str, Enum):
    POSITIVE = "Positive"
    NEUTRAL = "Neutral"
    NEGATIVE = "Negative"


class ArticleClassification(BaseModel):
    category: Category
    sentiment: Sentiment


class ArticleAnalyzer:
    def __init__(self):
        # FinBERT: a BERT model fine-tuned for financial sentiment classification
        self.classifier = pipeline("text-classification", model="ProsusAI/finbert")
        self.base_prompt = """
        Classify the following article into one of these categories:
        - Regulatory News
        - Institutional Adoption
        - Market Sentiment
        - Macroeconomic Factors
        - Security & Hacks
        - Technological Developments
        - Whale & Exchange Activity

        Also, assign a sentiment (Positive, Neutral or Negative).
        """
        print(f"JSON schema used for structured output classification: {ArticleClassification.model_json_schema()}")

    @staticmethod
    def convert_to_markdown(html_content: str) -> str:
        """
        Convert HTML content to Markdown format.

        Args:
            html_content: Cleaned HTML content

        Returns:
            Markdown content
        """
        # `strip` drops the listed tags entirely; heading_style="atx" emits '#'-style headings
        return markdownify.markdownify(
            html_content,
            strip=["script", "style", "img", "svg"],
            heading_style="atx",
        )

    def classify_article_llm(self, article_text):
        prompt = f"""{self.base_prompt}

        ARTICLE:
        {article_text}

        OUTPUT FORMAT:
        Category:
        Sentiment:
        """
        # Passing the Pydantic JSON schema as `format` constrains the model
        # to produce structured output matching ArticleClassification.
        response = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": prompt}],
            format=ArticleClassification.model_json_schema(),
        )
        # Returns the raw JSON string produced by the model
        return response['message']['content']

    def classify_article_finbert(self, article_html):
        article_md = self.convert_to_markdown(article_html)
        # Split into 512-character chunks; FinBERT accepts at most 512 tokens,
        # and chunks of this character length stay comfortably within that limit.
        chunk_size = 512
        chunks = [article_md[i:i + chunk_size] for i in range(0, len(article_md), chunk_size)]
        results = []
        for chunk in chunks:
            result = self.classifier(chunk)
            results.append(result)
        return results
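

# --- Usage sketch (illustrative addition, not part of the original pipeline) ---
# Assumes a local Ollama server with the "llama3.2" model pulled and access to
# the Hugging Face Hub for the FinBERT weights; the HTML snippet is hypothetical.
if __name__ == "__main__":
    analyzer = ArticleAnalyzer()

    sample_html = "<h1>Exchange outflows hit a yearly high</h1><p>Large holders moved funds off exchanges this week.</p>"

    # LLM path: returns a JSON string matching ArticleClassification's schema
    print(analyzer.classify_article_llm(analyzer.convert_to_markdown(sample_html)))

    # FinBERT path: returns one sentiment prediction per 512-character chunk
    print(analyzer.classify_article_finbert(sample_html))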