# CryptoMarketParser/main_article_analyzer.py
import os
import sys
from sqlalchemy import create_engine, Column, String, Float, MetaData, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from article_analyzer import ArticleAnalyzer
import nltk
import logging
# Shared declarative base for the ORM models defined in this module.
Base = declarative_base()


class ArticleAnalysis(Base):
    """One row per analyzed article HTML file: the classifier's label and score."""

    __tablename__ = 'article_analyses'

    # File name of the analyzed article (primary key, so one row per file).
    filename = Column(String, primary_key=True)
    # Sentiment label as returned by the classifier — presumably FinBERT's
    # positive/negative/neutral; confirm against ArticleAnalyzer.
    label = Column(String)
    # Classifier confidence score associated with `label`.
    score = Column(Float)
def read_html_files(folder_path, extension=".html"):
    """Recursively read all files under *folder_path* matching *extension*.

    Args:
        folder_path: Root directory to walk.
        extension: File suffix (or tuple of suffixes, as accepted by
            ``str.endswith``) to match. Defaults to ``".html"``, preserving
            the original behavior.

    Returns:
        dict mapping each matching file's full path to its text content
        (decoded as UTF-8).
    """
    html_contents = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(extension):
                file_path = os.path.join(root, file)
                # Files are read eagerly; fine for modest corpora.
                with open(file_path, "r", encoding="utf-8") as f:
                    html_contents[file_path] = f.read()
    return html_contents
if __name__ == "__main__":
    # One-time NLTK setup (proxy + tokenizer download), left disabled.
    # nltk.set_proxy('http://127.0.0.1:7890')
    # nltk.download('punkt_tab')

    # Suppress all log output below CRITICAL (keeps model/library chatter quiet).
    logging.basicConfig(level=logging.CRITICAL)
    analyzer = ArticleAnalyzer()

    # Local SQLite database file in the working directory.
    engine = create_engine('sqlite:///article_analysis.db')
    Session = sessionmaker(bind=engine)
    session = Session()
    html_files = read_html_files("./data")
    print(f"Parsed {len(html_files)} html files")

    # Create the article_analyses table if it does not already exist.
    Base.metadata.create_all(engine)

    # Smoke-test the classifier on a fixed sample sentence (spelling of the
    # sample is intentional input text — do not "fix" it).
    result = analyzer.classify_article_finbert("Strong earning growth and expending market shares have positionned the company for long term success.")
    print(f'result {result}')

    # Batch pipeline over the parsed HTML files, currently disabled.
    # NOTE(review): before re-enabling, replace the bare `except:` below with a
    # narrow exception type (e.g. sqlalchemy IntegrityError) — as written it
    # would swallow every error, including KeyboardInterrupt.
    # for file, content in html_files.items():
    #     result = analyzer.classify_article_finbert(content)
    #     filename = os.path.basename(file)
    #     print(f'result {result}')
    #     label = result[0]['label']
    #     score = result[0]['score']
    #     analysis = ArticleAnalysis(filename=filename, label=label, score=score)

    #     try:
    #         session.add(analysis)
    #         session.commit()
    #     except:
    #         session.rollback()
    #         existing = session.query(ArticleAnalysis).filter_by(filename=filename).first()
    #         if existing:
    #             existing.label = label
    #             existing.score = score
    #             session.commit()
    #     finally:
    #         session.close()
    #     print(f"article [{file}] - analyzed as [{result}]\n")