CryptoMarketParser/article_analyzer.py

80 lines
2.6 KiB
Python
Raw Normal View History

from enum import Enum
2025-03-22 04:52:17 +08:00
from finBERT.finbert import predict
from transformers import AutoModelForSequenceClassification
import ollama
from pydantic import BaseModel
2025-03-19 15:28:30 +08:00
import markdownify
# Closed set of topics an article can be filed under. Members subclass ``str``
# so they compare equal to, and serialize as, their human-readable label.
# (Documented with comments rather than a docstring so that any schema
# generated from this enum is left unchanged.)
class Category(str, Enum):
    # Policy, legislation and regulator actions.
    REGULATORY_NEWS = "Regulatory News"
    # Uptake by funds, banks and corporations.
    INSTITUTIONAL_ADOPTION = "Institutional Adoption"
    # General mood of market participants.
    MARKET_SENTIMENT = "Market Sentiment"
    # Rates, inflation and other economy-wide drivers.
    MACROECONOMIC_FACTORS = "Macroeconomic Factors"
    # Breaches, exploits and other security incidents.
    SECURITY_HACKS = "Security & Hacks"
    # Protocol upgrades and other technical progress.
    TECHNOLOGICAL_DEVELOPMENTS = "Technological Developments"
    # Large-holder movements and exchange flows.
    WHALE_EXCHANGE_ACTIVITY = "Whale & Exchange Activity"
# Three-way polarity label attached to an article. Members subclass ``str`` so
# they compare equal to, and serialize as, their capitalized label.
# (Documented with comments rather than a docstring so that any schema
# generated from this enum is left unchanged.)
class Sentiment(str, Enum):
    POSITIVE = "Positive"
    NEUTRAL = "Neutral"
    NEGATIVE = "Negative"
# Structured result of classifying one article: exactly one topic category and
# one sentiment label. Its JSON schema is what ArticleAnalyzer passes to the
# LLM as the required output format.
# NOTE: intentionally documented with comments, not a docstring -- a model
# docstring would be emitted as "description" in model_json_schema().
class ArticleClassification(BaseModel):
    # Topic the article belongs to.
    category: Category
    # Polarity assigned to the article.
    sentiment: Sentiment
class ArticleAnalyzer:
    """Classify articles with an Ollama-served LLM or with FinBERT.

    Two independent back-ends are exposed:

    * ``classify_article_llm`` sends the article to an Ollama chat model and
      constrains the reply to the ``ArticleClassification`` JSON schema.
    * ``classify_article_finbert`` converts article HTML to Markdown and runs
      finBERT's ``predict`` on the result.
    """

    # NOTE(review): this literal looks like a quoted leftover of a variable
    # named ``args.model_path`` rather than a real checkpoint location. It is
    # kept as the default so that ``ArticleAnalyzer()`` behaves exactly as
    # before; pass ``model_path`` explicitly to load an actual checkpoint.
    DEFAULT_MODEL_PATH = "args.model_path"

    # Ollama model tag used when the caller does not choose one.
    DEFAULT_LLM_MODEL = "llama3.2"

    # Instruction block placed in front of every LLM request. Written as
    # explicit lines so its content does not depend on source indentation.
    BASE_PROMPT = (
        "\n"
        "Classify the following article into one of these categories:\n"
        "- Regulatory News\n"
        "- Institutional Adoption\n"
        "- Market Sentiment\n"
        "- Macroeconomic Factors\n"
        "- Security & Hacks\n"
        "- Technological Developments\n"
        "- Whale & Exchange Activity\n"
        "Also, assign a sentiment (Positive, Neutral or Negative)\n"
    )

    def __init__(
        self,
        model_path: str = DEFAULT_MODEL_PATH,
        *,
        llm_model: str = DEFAULT_LLM_MODEL,
        use_gpu: bool = True,
    ):
        """Load the sequence-classification model and prepare the LLM prompt.

        Args:
            model_path: Name or path handed to
                ``AutoModelForSequenceClassification.from_pretrained``.
            llm_model: Ollama model tag used by ``classify_article_llm``.
            use_gpu: Forwarded to finBERT's ``predict`` as ``use_gpu``.
        """
        self.model_path = model_path
        self.llm_model = llm_model
        self.use_gpu = use_gpu

        # Loaded with a three-label classification head.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path, num_labels=3, cache_dir=None
        )
        self.base_prompt = self.BASE_PROMPT

        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")

    @staticmethod
    def convert_to_markdown(html_content: str) -> str:
        """Convert HTML content to Markdown format.

        Args:
            html_content: Cleaned HTML content

        Returns:
            Markdown content
        """
        # NOTE(review): ``strip_tags`` and ``code_block`` do not appear among
        # markdownify's documented options and are presumably ignored --
        # confirm against the installed version before relying on them.
        return markdownify.markdownify(
            html_content,
            strip=["script", "style", "img", "svg"],
            strip_tags=True,
            heading_style="atx",
            code_block=True,
        )

    def _build_prompt(self, article_text):
        """Return the full LLM prompt: base instructions, article, output format."""
        # NOTE(review): the plain-text "Category:/Sentiment:" output format
        # below conflicts with the JSON schema passed as ``format`` in
        # ``classify_article_llm``; the wording is left unchanged here.
        return (
            f"{self.base_prompt}\n"
            f"ARTICLE: {article_text}\n"
            "OUTPUT FORMAT:\n"
            "Category: <category>\n"
            "Sentiment: <sentiment>\n"
        )

    def classify_article_llm(self, article_text):
        """Ask the configured Ollama model to classify ``article_text``.

        Args:
            article_text: Article body inserted verbatim into the prompt.

        Returns:
            The raw ``content`` of the model's reply message. The request is
            constrained with ``ArticleClassification.model_json_schema()``,
            but the reply is returned as-is without parsing or validation.
        """
        response = ollama.chat(
            model=self.llm_model,
            messages=[{"role": "user", "content": self._build_prompt(article_text)}],
            format=ArticleClassification.model_json_schema(),
        )
        return response['message']['content']

    def classify_article_finbert(self, article_html):
        """Run finBERT's ``predict`` on the Markdown rendering of an article.

        Args:
            article_html: Article HTML; converted with ``convert_to_markdown``
                before prediction.

        Returns:
            Whatever finBERT's ``predict`` returns, unchanged.
        """
        article_md = self.convert_to_markdown(article_html)
        return predict(article_md, model=self.model, use_gpu=self.use_gpu)