"""Classify the sentiment of HTML articles in fixed-size text chunks.

Run as a script: every ``.html`` file below ``./data`` is read, split into
chunks, passed to ``ArticleAnalyzer.classify_article_finbert`` and the
collected results are printed.  The ``article_analyses`` table is created in
a SQLite database, but writing rows to it is currently disabled (see the
commented-out section in ``main``).
"""
import logging
import os
import sys

import nltk
from sqlalchemy import create_engine, Column, String, Float, MetaData, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from article_analyzer import ArticleAnalyzer

# Directory scanned for articles and location of the SQLite database.
DATA_DIR = "./data"
DATABASE_DIR = "databases"
DATABASE_URL = 'sqlite:///databases/article_analysis.db'

# Number of characters handed to the classifier per call.
CHUNK_SIZE = 512

Base = declarative_base()


class ArticleAnalysis(Base):
    """ORM row holding one classification result, keyed by file name."""

    __tablename__ = 'article_analyses'

    filename = Column(String, primary_key=True)
    label = Column(String)
    score = Column(Float)


def read_html_files(folder_path: str) -> dict:
    """Read every ``.html`` file found below *folder_path*.

    The directory tree is walked recursively.  Files are decoded as UTF-8,
    so a file that is not valid UTF-8 raises ``UnicodeDecodeError``.

    Args:
        folder_path: Root directory to search.

    Returns:
        A dict mapping each file's path (as joined by ``os.path.join``) to
        its full text content.  Empty when no ``.html`` file is found.
    """
    html_contents = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".html"):
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    html_contents[file_path] = f.read()
    return html_contents


def classify_in_chunks(analyzer, content, chunk_size=CHUNK_SIZE):
    """Classify *content* chunk by chunk and collect all results.

    Args:
        analyzer: Object exposing ``classify_article_finbert(text)``.  Its
            return value is extended into a list, so it must be iterable
            (presumably a list of ``{'label': ..., 'score': ...}`` dicts --
            confirm against ``ArticleAnalyzer``).
        content: Text to classify.
        chunk_size: Number of characters per chunk.

    Returns:
        The concatenated classifier results, or a single neutral
        placeholder entry when no chunk contained non-whitespace text.
    """
    results = []
    for start in range(0, len(content), chunk_size):
        chunk = content[start:start + chunk_size]
        # Whitespace-only chunks are not sent to the classifier.
        if chunk.strip():
            results.extend(analyzer.classify_article_finbert(chunk))
    return results if results else [{'label': 'neutral', 'score': 0.0}]


def main():
    """Classify every HTML article under ``./data`` and print the results."""
    # nltk.set_proxy('http://127.0.0.1:7890')
    # nltk.download('punkt_tab')
    logging.basicConfig(level=logging.CRITICAL)

    analyzer = ArticleAnalyzer()

    # SQLite creates the database file on demand but not its parent
    # directory, so make sure the directory exists before connecting.
    os.makedirs(DATABASE_DIR, exist_ok=True)
    engine = create_engine(DATABASE_URL)
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        html_files = read_html_files(DATA_DIR)
        print(f"Parsed {len(html_files)} html files")

        Base.metadata.create_all(engine)

        # result = analyzer.classify_article_finbert("Strong earning growth and expending market shares have positionned the company for long term success.")
        # print(f'result {result}')

        for file, content in html_files.items():
            result = classify_in_chunks(analyzer, content)
            # Only needed by the disabled persistence step below.
            filename = os.path.basename(file)
            print(f'result {result}')

            # NOTE: persisting results is currently disabled.
            # label = result[0]['label']
            # score = result[0]['score']
            # analysis = ArticleAnalysis(filename=filename, label=label, score=score)
            # try:
            #     session.add(analysis)
            #     session.commit()
            # except:
            #     session.rollback()
            #     existing = session.query(ArticleAnalysis).filter_by(filename=filename).first()
            #     if existing:
            #         existing.label = label
            #         existing.score = score
            #         session.commit()
            # finally:
            #     session.close()
            # print(f"article [{file}] - analyzed as [{result}]\n")
    finally:
        # Release the session even when reading or classification fails.
        session.close()


if __name__ == "__main__":
    main()