# Listing metadata: 78 lines, 2.5 KiB, Python
from enum import Enum
|
|
from transformers import pipeline
|
|
import ollama
|
|
from pydantic import BaseModel
|
|
import markdownify
|
|
|
|
class Category(str, Enum):
    """Topic label assigned to a news article.

    Members are also ``str`` instances, so they compare equal to (and
    serialize as) their display text. The values mirror the category names
    listed in the LLM classification prompt.
    """

    REGULATORY_NEWS = "Regulatory News"
    INSTITUTIONAL_ADOPTION = "Institutional Adoption"
    MARKET_SENTIMENT = "Market Sentiment"
    MACROECONOMIC_FACTORS = "Macroeconomic Factors"
    SECURITY_HACKS = "Security & Hacks"
    TECHNOLOGICAL_DEVELOPMENTS = "Technological Developments"
    WHALE_EXCHANGE_ACTIVITY = "Whale & Exchange Activity"
|
|
|
|
class Sentiment(str, Enum):
    """Sentiment label assigned to a news article.

    Members are also ``str`` instances, so they compare equal to (and
    serialize as) their display text.
    """

    POSITIVE = "Positive"
    NEUTRAL = "Neutral"
    NEGATIVE = "Negative"
|
|
|
|
class ArticleClassification(BaseModel):
    """Structured classification result for a single article.

    The JSON schema of this model (``model_json_schema()``) is what the LLM
    classifier passes to Ollama to constrain the model's reply.
    """

    # Topic of the article; must be one of the Category values.
    category: Category
    # Tone of the article; must be one of the Sentiment values.
    sentiment: Sentiment
|
|
|
|
class ArticleAnalyzer:
    """Classify news articles with an Ollama-served LLM and with FinBERT.

    Two independent classifiers are exposed:

    * ``classify_article_llm`` prompts a chat model for a category and a
      sentiment, constraining the reply to the ``ArticleClassification``
      JSON schema.
    * ``classify_article_finbert`` converts the article HTML to Markdown and
      runs a Hugging Face ``text-classification`` pipeline on it.
    """

    # BERT-based encoders such as FinBERT have 512 position embeddings; longer
    # inputs must be truncated or the forward pass fails.
    _FINBERT_MAX_TOKENS = 512

    def __init__(
        self,
        *,
        finbert_model: str = "ProsusAI/finbert",
        llm_model: str = "llama3.2",
    ) -> None:
        """Load the FinBERT pipeline and prepare the LLM prompt.

        Args:
            finbert_model: Hugging Face model id for the text-classification
                pipeline.
            llm_model: Name of the Ollama model used by
                ``classify_article_llm``.
        """
        self.llm_model = llm_model
        self.classifier = pipeline("text-classification", model=finbert_model)
        # Built line by line so the prompt text does not depend on how this
        # source file happens to be indented.
        self.base_prompt = (
            "\n"
            "Classify the following article into one of these categories:\n"
            "- Regulatory News\n"
            "- Institutional Adoption\n"
            "- Market Sentiment\n"
            "- Macroeconomic Factors\n"
            "- Security & Hacks\n"
            "- Technological Developments\n"
            "- Whale & Exchange Activity\n"
            "\n"
            "Also, assign a sentiment (Positive, Neutral or Negative)\n"
        )
        print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")

    @staticmethod
    def convert_to_markdown(html_content: str) -> str:
        """Convert HTML content to Markdown format.

        Args:
            html_content: Cleaned HTML content.

        Returns:
            Markdown content.
        """
        # NOTE(review): `strip_tags` and `code_block` do not appear among
        # markdownify's documented options and are presumably ignored --
        # confirm and drop them if so.
        # NOTE(review): markdownify's `strip` is documented as stripping the
        # listed tags; verify that <script>/<style> *contents* do not leak
        # into the output as plain text.
        return markdownify.markdownify(
            html_content,
            strip=["script", "style", "img", "svg"],
            strip_tags=True,
            heading_style="atx",
            code_block=True,
        )

    def classify_article_llm(self, article_text: str) -> str:
        """Ask the LLM for the article's category and sentiment.

        Args:
            article_text: Article body to classify.

        Returns:
            The raw message content returned by Ollama. Because the request
            passes the ``ArticleClassification`` JSON schema as ``format``,
            this is expected to be a JSON document matching that schema; it
            is returned unparsed.
        """
        # NOTE(review): the "OUTPUT FORMAT" section below describes a
        # plain-text reply while `format=` requests schema-constrained JSON.
        # Consider aligning the prompt with the JSON schema.
        prompt = (
            f"{self.base_prompt}\n"
            "\n"
            f"ARTICLE: {article_text}\n"
            "\n"
            "OUTPUT FORMAT:\n"
            "Category: <category>\n"
            "Sentiment: <sentiment>\n"
        )
        response = ollama.chat(
            model=self.llm_model,
            messages=[{"role": "user", "content": prompt}],
            format=ArticleClassification.model_json_schema(),
        )
        return response['message']['content']

    def classify_article_finbert(self, article_html: str):
        """Run the FinBERT pipeline on an article given as HTML.

        The HTML is first converted to Markdown. Input longer than the
        model's 512-token limit is truncated to its leading tokens instead
        of raising inside the model.

        Args:
            article_html: Article content as HTML.

        Returns:
            Whatever the text-classification pipeline returns for the
            article (label/score predictions).
        """
        article_md = self.convert_to_markdown(article_html)
        # Without explicit truncation, long articles overflow the encoder's
        # position embeddings and the pipeline call fails.
        return self.classifier(
            article_md,
            truncation=True,
            max_length=self._FINBERT_MAX_TOKENS,
        )