From 68e376ef32c02c988bef6f9f45587e02e7ec6bdc Mon Sep 17 00:00:00 2001
From: Simon Moisy
Date: Fri, 21 Mar 2025 17:39:30 +0800
Subject: [PATCH] added storage to db

---
Review notes (ignored by git am, not part of the commit message):
- Rows are written with Session.merge() instead of insert-then-retry.
- Only SQLAlchemyError is caught; failures are rolled back and reported.
- One session is used for the whole run and closed after the loop.
- The commit id above and the blob ids below still describe the original
  submission; blank-line whitespace was reconstructed and may need a
  whitespace-tolerant apply.

 main_article_analyzer.py | 60 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/main_article_analyzer.py b/main_article_analyzer.py
index 899c56a..8f2eb6d 100644
--- a/main_article_analyzer.py
+++ b/main_article_analyzer.py
@@ -1,7 +1,45 @@
 import os
-
+from sqlalchemy import create_engine, Column, String, Float, MetaData, Table
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
 from article_analyzer import ArticleAnalyzer
 
+Base = declarative_base()
+
+
+class ArticleAnalysis(Base):
+    """Sentiment label and score stored for one analyzed article."""
+
+    __tablename__ = 'article_analyses'
+
+    # Primary key: a second analysis of the same file name replaces the first.
+    filename = Column(String, primary_key=True)
+    label = Column(String)
+    score = Column(Float)
+
+
+def save_analysis(session, filename, label, score):
+    """Insert or update the stored analysis for ``filename``.
+
+    Returns True once the row is committed. If the database rejects the
+    write, the transaction is rolled back, the error is printed and False
+    is returned so the caller can carry on with the next article.
+    """
+    analysis = ArticleAnalysis(filename=filename, label=label, score=score)
+    try:
+        # merge() looks the row up by primary key and emits an INSERT or an
+        # UPDATE as needed, so no insert-then-retry fallback is required.
+        session.merge(analysis)
+        session.commit()
+    except SQLAlchemyError as exc:
+        # Leave the session clean and usable for the next article.
+        session.rollback()
+        print(f"could not store analysis for [{filename}]: {exc}")
+        return False
+    return True
+
+
 def read_html_files(folder_path):
     html_contents = {}
     for root, _, files in os.walk(folder_path):
@@ -16,9 +54,27 @@ def read_html_files(folder_path):
 
 if __name__ == "__main__":
     analyzer = ArticleAnalyzer()
+
+    engine = create_engine('sqlite:///article_analysis.db')
+    # Create the table on first run; existing tables are left untouched.
+    Base.metadata.create_all(engine)
+    Session = sessionmaker(bind=engine)
+    session = Session()
+
     html_files = read_html_files("./data")
     print(f"Parsed {len(html_files)} html files")
-    
+
     for file, content in html_files.items():
         result = analyzer.classify_article_finbert(content)
+
+        # NOTE(review): the key is the base name only, so files sharing a name in
+        # different folders would overwrite each other's row - confirm acceptable.
+        filename = os.path.basename(file)
+        label = result[0]['label']
+        score = result[0]['score']
+        save_analysis(session, filename, label, score)
+
         print(f"article [{file}] - analyzed as [{result}]\n")
+
+    # One session serves the whole run, so it is closed only after the loop.
+    session.close()