article analyzer with finbert working
This commit is contained in:
parent
7c5602543d
commit
e1465539d2
3
.idea/.gitignore
generated
vendored
Normal file
3
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
@ -1,7 +1,8 @@
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from transformers import pipeline
|
||||||
import ollama
|
import ollama
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
import markdownify
|
||||||
|
|
||||||
class Category(str, Enum):
|
class Category(str, Enum):
|
||||||
REGULATORY_NEWS = "Regulatory News"
|
REGULATORY_NEWS = "Regulatory News"
|
||||||
@ -23,6 +24,7 @@ class ArticleClassification(BaseModel):
|
|||||||
|
|
||||||
class ArticleAnalyzer:
|
class ArticleAnalyzer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
self.classifier = pipeline("text-classification", model="ProsusAI/finbert")
|
||||||
self.base_prompt = """
|
self.base_prompt = """
|
||||||
Classify the following article into one of these categories:
|
Classify the following article into one of these categories:
|
||||||
- Regulatory News
|
- Regulatory News
|
||||||
@ -35,9 +37,28 @@ class ArticleAnalyzer:
|
|||||||
|
|
||||||
Also, assign a sentiment (Positive, Neutral or Negative)
|
Also, assign a sentiment (Positive, Neutral or Negative)
|
||||||
"""
|
"""
|
||||||
print(f"This JSON model is going to be used for structured ouput classification : {ArticleClassification.model_json_schema()}")
|
print(f"This JSON model is going to be used for structured output classification : {ArticleClassification.model_json_schema()}")
|
||||||
|
|
||||||
def classify_article(self, article_text):
|
@staticmethod
|
||||||
|
def convert_to_markdown(html_content: str) -> str:
|
||||||
|
"""
|
||||||
|
Convert HTML content to Markdown format.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
html_content: Cleaned HTML content
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Markdown content
|
||||||
|
"""
|
||||||
|
return markdownify.markdownify(
|
||||||
|
html_content,
|
||||||
|
strip=["script", "style", "img", "svg"],
|
||||||
|
strip_tags=True,
|
||||||
|
heading_style="atx",
|
||||||
|
code_block=True
|
||||||
|
)
|
||||||
|
|
||||||
|
def classify_article_llm(self, article_text):
|
||||||
prompt = f"""{self.base_prompt}
|
prompt = f"""{self.base_prompt}
|
||||||
|
|
||||||
ARTICLE: {article_text}
|
ARTICLE: {article_text}
|
||||||
@ -49,4 +70,9 @@ class ArticleAnalyzer:
|
|||||||
response = ollama.chat(model="llama3.2",
|
response = ollama.chat(model="llama3.2",
|
||||||
messages=[{"role": "user", "content": prompt}],
|
messages=[{"role": "user", "content": prompt}],
|
||||||
format=ArticleClassification.model_json_schema())
|
format=ArticleClassification.model_json_schema())
|
||||||
return response['message']['content']
|
return response['message']['content']
|
||||||
|
|
||||||
|
def classify_article_finbert(self, article_html):
|
||||||
|
article_md = self.convert_to_markdown(article_html)
|
||||||
|
result = self.classifier(article_md)
|
||||||
|
return result
|
||||||
@ -56,67 +56,25 @@ class BitcoinTrendAnalysis:
|
|||||||
smooth_prices = pd.Series(prices).rolling(window=smoothing_window).mean()
|
smooth_prices = pd.Series(prices).rolling(window=smoothing_window).mean()
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(14, 7))
|
fig, ax = plt.subplots(figsize=(14, 7))
|
||||||
plt.subplots_adjust(bottom=0.25) # Space for widgets
|
plt.subplots_adjust(bottom=0.25)
|
||||||
ax2 = ax.twinx() # Secondary axis for prominence
|
|
||||||
|
|
||||||
# Initial peaks and prominences
|
|
||||||
peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices, window=window, factor=prominence_factor,
|
peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices, window=window, factor=prominence_factor,
|
||||||
distance=distance)
|
distance=distance)
|
||||||
|
|
||||||
# Plot main price curve
|
ax.plot(self.df.index, smooth_prices, label='Bitcoin Smooth Price')
|
||||||
price_line, = ax.plot(self.df.index, smooth_prices, label='Bitcoin Smooth Price')
|
ax.scatter(self.df.index[peaks], smooth_prices[peaks], color='green', s=100, marker='^', label='Local Maxima')
|
||||||
|
ax.scatter(self.df.index[valleys], smooth_prices[valleys], color='red', s=100, marker='v', label='Local Minima')
|
||||||
|
|
||||||
# Scatter plots for peaks/valleys
|
for peak, valley in zip(peaks, valleys):
|
||||||
peaks_plot = ax.scatter(self.df.index[peaks], smooth_prices[peaks], color='green', s=100, marker='^',
|
ax.plot([self.df.index[peak], self.df.index[valley]], [smooth_prices[peak], smooth_prices[valley]],
|
||||||
label='Local Maxima')
|
color='orange', lw=1)
|
||||||
valleys_plot = ax.scatter(self.df.index[valleys], smooth_prices[valleys], color='red', s=100, marker='v',
|
|
||||||
label='Local Minima')
|
|
||||||
|
|
||||||
# Prominence line on secondary y-axis
|
|
||||||
prominence_line, = ax2.plot(self.df.index, prominences, color="purple", linestyle="dashed", alpha=0.7,
|
|
||||||
label="Prominence")
|
|
||||||
|
|
||||||
ax2.set_ylabel("Prominence")
|
|
||||||
|
|
||||||
ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={prominence_factor}')
|
ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={prominence_factor}')
|
||||||
ax.set_xlabel('Date')
|
ax.set_xlabel('Date')
|
||||||
ax.set_ylabel('Price')
|
ax.set_ylabel('Price')
|
||||||
ax.legend()
|
ax.legend()
|
||||||
ax2.legend(loc="upper right")
|
|
||||||
ax.grid(True)
|
ax.grid(True)
|
||||||
|
|
||||||
# Slider setup
|
|
||||||
ax_slider = plt.axes([0.2, 0.05, 0.65, 0.03]) # Positioning of slider
|
|
||||||
slider = Slider(ax_slider, 'Prom Factor', 0.1, 2.0, valinit=prominence_factor, valstep=0.05)
|
|
||||||
|
|
||||||
# Update function for slider
|
|
||||||
def update_plot(factor):
|
|
||||||
# Recalculate peaks and prominences
|
|
||||||
peaks, valleys, prominences = self.adaptive_find_peaks(smooth_prices.to_numpy(), window=window,
|
|
||||||
factor=factor, distance=distance)
|
|
||||||
print(len(peaks))
|
|
||||||
# Update scatter points for peaks
|
|
||||||
peaks_plot.set_offsets(np.column_stack([
|
|
||||||
(self.df.index[peaks] - np.datetime64('1970-01-01')) / np.timedelta64(1, 's'),
|
|
||||||
smooth_prices[peaks]
|
|
||||||
]))
|
|
||||||
|
|
||||||
# Update scatter points for valleys
|
|
||||||
valleys_plot.set_offsets(np.column_stack([
|
|
||||||
(self.df.index[valleys] - np.datetime64('1970-01-01')) / np.timedelta64(1, 's'),
|
|
||||||
smooth_prices[valleys]
|
|
||||||
]))
|
|
||||||
|
|
||||||
# Update prominence line
|
|
||||||
prominence_line.set_ydata(prominences)
|
|
||||||
|
|
||||||
# Update the title to reflect the current prominence factor
|
|
||||||
ax.set_title(f'Bitcoin Price Trends Analysis\nfactor={factor}')
|
|
||||||
|
|
||||||
# Redraw the figure
|
|
||||||
fig.canvas.draw_idle()
|
|
||||||
|
|
||||||
slider.on_changed(update_plot) # Update plot when slider changes
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def analyze_trends_linear_regression(self):
|
def analyze_trends_linear_regression(self):
|
||||||
|
|||||||
@ -20,5 +20,5 @@ if __name__ == "__main__":
|
|||||||
print(f"Parsed {len(html_files)} html files")
|
print(f"Parsed {len(html_files)} html files")
|
||||||
|
|
||||||
for file, content in html_files.items():
|
for file, content in html_files.items():
|
||||||
result = analyzer.classify_article(content)
|
result = analyzer.classify_article_finbert(content)
|
||||||
print(f"article [{file}] - analyzed as [{result}]\n")
|
print(f"article [{file}] - analyzed as [{result}]\n")
|
||||||
|
|||||||
@ -3,4 +3,4 @@ from bitcoin_trend_analysis import BitcoinTrendAnalysis
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
ma = BitcoinTrendAnalysis(db_path='bitcoin_historical_data.db')
|
ma = BitcoinTrendAnalysis(db_path='bitcoin_historical_data.db')
|
||||||
ma.load_data()
|
ma.load_data()
|
||||||
ma.analyze_trends_peaks(distance=1)
|
ma.analyze_trends_peaks(distance=1, prominence_factor=0.1)
|
||||||
|
|||||||
1903
poetry.lock
generated
Normal file
1903
poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
30
pyproject.toml
Normal file
30
pyproject.toml
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
[project]
|
||||||
|
name = "cryptomarketparser"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = ""
|
||||||
|
authors = [
|
||||||
|
{name = "Simon Moisy",email = "simon.moisy@tutanota.com"}
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.10,<4.0"
|
||||||
|
dependencies = [
|
||||||
|
"numpy (>=2.2.3,<3.0.0)",
|
||||||
|
"pandas (>=2.2.3,<3.0.0)",
|
||||||
|
"sqlalchemy (>=2.0.39,<3.0.0)",
|
||||||
|
"scipy (>=1.15.2,<2.0.0)",
|
||||||
|
"matplotlib (>=3.10.1,<4.0.0)",
|
||||||
|
"scikit-learn (>=1.6.1,<2.0.0)",
|
||||||
|
"ollama (>=0.4.7,<0.5.0)",
|
||||||
|
"transformers (>=4.49.0,<5.0.0)",
|
||||||
|
"markdownify (>=1.1.0,<2.0.0)"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
|
[[tool.poetry.source]]
|
||||||
|
name = "pytorch"
|
||||||
|
url = "https://download.pytorch.org/whl/cu121"
|
||||||
|
priority = "explicit"
|
||||||
Loading…
x
Reference in New Issue
Block a user