# Article sentiment analysis: classify local HTML articles with FinBERT
# and store the results in a local SQLite database.

import os
import sys
from sqlalchemy import create_engine, Column, String, Float, MetaData, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from article_analyzer import ArticleAnalyzer
import nltk
import logging

Base = declarative_base()

class ArticleAnalysis(Base):
    """ORM row holding the sentiment classification result for one article file.

    One row per source file; re-analyzing the same file should replace the
    existing row (``filename`` is the primary key).
    """

    __tablename__ = 'article_analyses'

    # Basename of the analyzed HTML file — natural unique key.
    filename = Column(String, primary_key=True)
    # Predicted sentiment label (presumably FinBERT's
    # positive/negative/neutral — confirm against ArticleAnalyzer).
    label = Column(String)
    # Model confidence score for the predicted label.
    score = Column(Float)

    def __repr__(self):
        # Debug-friendly representation per Python best practice.
        return (f"ArticleAnalysis(filename={self.filename!r}, "
                f"label={self.label!r}, score={self.score!r})")
def read_html_files(folder_path, extension=".html"):
    """Recursively collect files matching *extension* under *folder_path*.

    Args:
        folder_path: Root directory to walk. A non-existent directory
            simply yields an empty result (``os.walk`` produces nothing).
        extension: Filename suffix to match; default ``".html"`` preserves
            the original behavior. Matching is case-sensitive.

    Returns:
        dict mapping the full file path to the file's text content,
        decoded as UTF-8.

    Raises:
        UnicodeDecodeError: if a matched file is not valid UTF-8.
    """
    html_contents = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(extension):
                file_path = os.path.join(root, file)
                # Explicit encoding: don't depend on the platform default.
                with open(file_path, "r", encoding="utf-8") as f:
                    html_contents[file_path] = f.read()
    return html_contents
if __name__ == "__main__":
    # One-time NLTK setup — run manually if the punkt data is missing:
    #   nltk.set_proxy('http://127.0.0.1:7890')
    #   nltk.download('punkt_tab')

    # Silence model/library chatter; only critical messages surface.
    logging.basicConfig(level=logging.CRITICAL)

    analyzer = ArticleAnalyzer()

    # SQLite will not create a missing parent directory — ensure it exists
    # before the first connection, otherwise create_engine's connect fails.
    os.makedirs("databases", exist_ok=True)
    engine = create_engine('sqlite:///databases/article_analysis.db')
    Session = sessionmaker(bind=engine)
    session = Session()

    html_files = read_html_files("./data")
    print(f"Parsed {len(html_files)} html files")

    # Create the article_analyses table if it does not exist yet.
    Base.metadata.create_all(engine)

    try:
        for file, content in html_files.items():
            # FinBERT has a bounded input length; split the raw content into
            # fixed-size character chunks as a rough approximation.
            # NOTE(review): content is raw HTML including markup — consider
            # stripping tags before classification; confirm intent.
            chunk_size = 512
            chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]

            results = []
            for chunk in chunks:
                if chunk.strip():
                    chunk_result = analyzer.classify_article_finbert(chunk)
                    results.extend(chunk_result)

            # Fall back to a neutral zero-confidence result for empty files.
            result = results if results else [{'label': 'neutral', 'score': 0.0}]

            filename = os.path.basename(file)
            print(f'result {result}')

            # Persist the first classification for this file. merge() is an
            # insert-or-update keyed on the primary key (filename), replacing
            # the previous query-then-update dance.
            top = result[0]
            analysis = ArticleAnalysis(filename=filename,
                                       label=top['label'],
                                       score=top['score'])
            try:
                session.merge(analysis)
                session.commit()
            except Exception:
                # Keep processing the remaining files; the session must be
                # rolled back before it can be used again.
                session.rollback()
                print(f"failed to persist analysis for {filename}")
    finally:
        # Always release the DB connection, even on an unexpected error.
        session.close()