CryptoMarketParser/article_analyzer.py

85 lines
2.7 KiB
Python
Raw Normal View History

from enum import Enum
2025-03-19 15:28:30 +08:00
from transformers import pipeline
import ollama
from pydantic import BaseModel
2025-03-19 15:28:30 +08:00
import markdownify
# Closed set of topic labels an article can be assigned.
#
# The `str` mix-in makes every member a real string equal to its label text.
# The label texts are the same ones listed in ArticleAnalyzer's base prompt.
#
# Documented with comments instead of a docstring: pydantic can copy a class
# docstring into the generated JSON schema, which would change the schema.
class Category(str, Enum):
    # Laws, rulings and regulator statements.
    REGULATORY_NEWS = "Regulatory News"

    # Uptake by institutions.
    INSTITUTIONAL_ADOPTION = "Institutional Adoption"

    # General mood of the market.
    MARKET_SENTIMENT = "Market Sentiment"

    # Economy-wide drivers.
    MACROECONOMIC_FACTORS = "Macroeconomic Factors"

    # Breaches and other security incidents.
    SECURITY_HACKS = "Security & Hacks"

    # Technology changes.
    TECHNOLOGICAL_DEVELOPMENTS = "Technological Developments"

    # Large-holder and exchange movements.
    WHALE_EXCHANGE_ACTIVITY = "Whale & Exchange Activity"
# Three-way sentiment label attached to an article.
#
# The `str` mix-in makes every member a real string equal to its label text.
# Documented with comments instead of a docstring: pydantic can copy a class
# docstring into the generated JSON schema, which would change the schema.
class Sentiment(str, Enum):
    POSITIVE = "Positive"

    NEUTRAL = "Neutral"

    NEGATIVE = "Negative"
# Structured result of classifying one article: one Category plus one Sentiment.
#
# ArticleAnalyzer passes this model's JSON schema to `ollama.chat` as the
# `format` argument and prints it at construction time.
#
# Documented with comments instead of a docstring: pydantic can copy a model
# docstring into the generated JSON schema, which would change the schema.
class ArticleClassification(BaseModel):
    # Topic label chosen from the Category enum.
    category: Category

    # Sentiment label chosen from the Sentiment enum.
    sentiment: Sentiment
class ArticleAnalyzer:
    """Classify articles with a FinBERT pipeline and with an Ollama-served LLM.

    Attributes:
        classifier: ``transformers`` text-classification pipeline created with
            the ``ProsusAI/finbert`` model.
        base_prompt: Instruction text listing the allowed categories and
            sentiments; it is placed at the start of every LLM prompt.
    """

    def __init__(self):
        """Create the FinBERT pipeline, build the base prompt and print the schema."""
        self.classifier = pipeline("text-classification", model="ProsusAI/finbert")

        # Assembled from adjacent literals so the prompt text is independent of
        # how this source file is indented.
        self.base_prompt = (
            "\n"
            "Classify the following article into one of these categories:\n"
            "- Regulatory News\n"
            "- Institutional Adoption\n"
            "- Market Sentiment\n"
            "- Macroeconomic Factors\n"
            "- Security & Hacks\n"
            "- Technological Developments\n"
            "- Whale & Exchange Activity\n"
            "Also, assign a sentiment (Positive, Neutral or Negative)\n"
        )

        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")

    @staticmethod
    def convert_to_markdown(html_content: str) -> str:
        """Convert HTML content to Markdown format.

        Args:
            html_content: Cleaned HTML content.

        Returns:
            Markdown content produced by ``markdownify``.
        """
        # NOTE(review): `strip_tags` and `code_block` do not appear among
        # markdownify's documented options -- confirm they have any effect.
        # NOTE(review): `strip` leaves the listed tags unconverted; confirm
        # that the inner text of <script>/<style> does not leak into the output.
        return markdownify.markdownify(
            html_content,
            strip=["script", "style", "img", "svg"],
            strip_tags=True,
            heading_style="atx",
            code_block=True
        )

    def classify_article_llm(self, article_text):
        """Ask the ``llama3.2`` model (via Ollama) for a category and a sentiment.

        The reply is requested in the shape of ``ArticleClassification`` by
        passing that model's JSON schema as the ``format`` argument.

        Args:
            article_text: Article text inserted verbatim into the prompt.

        Returns:
            The ``content`` field of the reply message, exactly as returned by
            ``ollama.chat``; it is not parsed or validated here.
        """
        # NOTE(review): the "OUTPUT FORMAT" lines describe a plain-text layout
        # while `format` requests schema-shaped output -- confirm the mixed
        # instructions are intentional.
        prompt = (
            f"{self.base_prompt}\n"
            f"ARTICLE: {article_text}\n"
            "OUTPUT FORMAT:\n"
            "Category: <category>\n"
            "Sentiment: <sentiment>\n"
        )
        response = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": prompt}],
            format=ArticleClassification.model_json_schema(),
        )
        return response['message']['content']

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int) -> list[str]:
        """Cut *text* into consecutive slices of at most *chunk_size* characters.

        Args:
            text: String to split.
            chunk_size: Maximum number of characters per slice.

        Returns:
            The slices in order; an empty list when *text* is empty.

        Raises:
            ValueError: If *chunk_size* is not a positive integer.
        """
        # A negative step would silently yield no chunks and a zero step makes
        # range() fail with an unrelated message, so reject both up front.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        return [
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    def classify_article_finbert(self, article_html, chunk_size: int = 512):
        """Run the FinBERT pipeline over an HTML article, chunk by chunk.

        The HTML is converted to Markdown, split into fixed-size pieces, and
        each piece is passed to ``self.classifier`` separately.

        Args:
            article_html: HTML content of the article.
            chunk_size: Maximum chunk length in characters (not tokens).
                Defaults to 512, the value that was previously hard-coded.

        Returns:
            A list with one classifier result per chunk, in document order;
            an empty list when the converted article is empty.

        Raises:
            ValueError: If *chunk_size* is not a positive integer.
        """
        article_md = self.convert_to_markdown(article_html)
        chunks = self._split_into_chunks(article_md, chunk_size)
        return [self.classifier(chunk) for chunk in chunks]