# Article sentiment analysis: classify local HTML articles with FinBERT
# and store the results in a local SQLite database.

import os
import sys
from sqlalchemy import create_engine, Column, String, Float, MetaData, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from article_analyzer import ArticleAnalyzer
import nltk
import logging

Base = declarative_base()

class ArticleAnalysis(Base):
    """ORM row holding the sentiment classification result for one article file.

    One row per source file; re-analyzing the same file should replace the
    existing row (``filename`` is the primary key).
    """

    __tablename__ = 'article_analyses'

    # Basename of the analyzed HTML file — natural unique key.
    filename = Column(String, primary_key=True)
    # Predicted sentiment label (presumably FinBERT's
    # positive/negative/neutral — confirm against ArticleAnalyzer).
    label = Column(String)
    # Model confidence score for the predicted label.
    score = Column(Float)

    def __repr__(self):
        # Debug-friendly representation per Python best practice.
        return (f"ArticleAnalysis(filename={self.filename!r}, "
                f"label={self.label!r}, score={self.score!r})")
def read_html_files(folder_path, extension=".html"):
    """Recursively collect files matching *extension* under *folder_path*.

    Args:
        folder_path: Root directory to walk. A non-existent directory
            simply yields an empty result (``os.walk`` produces nothing).
        extension: Filename suffix to match; default ``".html"`` preserves
            the original behavior. Matching is case-sensitive.

    Returns:
        dict mapping the full file path to the file's text content,
        decoded as UTF-8.

    Raises:
        UnicodeDecodeError: if a matched file is not valid UTF-8.
    """
    html_contents = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(extension):
                file_path = os.path.join(root, file)
                # Explicit encoding: don't depend on the platform default.
                with open(file_path, "r", encoding="utf-8") as f:
                    html_contents[file_path] = f.read()
    return html_contents
if __name__ == "__main__":
    # One-time NLTK setup — run manually if the punkt data is missing:
    #   nltk.set_proxy('http://127.0.0.1:7890')
    #   nltk.download('punkt_tab')

    # Silence model/library chatter; only critical messages surface.
    logging.basicConfig(level=logging.CRITICAL)

    analyzer = ArticleAnalyzer()

    # SQLite will not create a missing parent directory — ensure it exists
    # before the first connection, otherwise create_engine's connect fails.
    os.makedirs("databases", exist_ok=True)
    engine = create_engine('sqlite:///databases/article_analysis.db')
    Session = sessionmaker(bind=engine)
    session = Session()

    html_files = read_html_files("./data")
    print(f"Parsed {len(html_files)} html files")

    # Create the article_analyses table if it does not exist yet.
    Base.metadata.create_all(engine)

    try:
        for file, content in html_files.items():
            # FinBERT has a bounded input length; split the raw content into
            # fixed-size character chunks as a rough approximation.
            # NOTE(review): content is raw HTML including markup — consider
            # stripping tags before classification; confirm intent.
            chunk_size = 512
            chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]

            results = []
            for chunk in chunks:
                if chunk.strip():
                    chunk_result = analyzer.classify_article_finbert(chunk)
                    results.extend(chunk_result)

            # Fall back to a neutral zero-confidence result for empty files.
            result = results if results else [{'label': 'neutral', 'score': 0.0}]

            filename = os.path.basename(file)
            print(f'result {result}')

            # Persist the first classification for this file. merge() is an
            # insert-or-update keyed on the primary key (filename), replacing
            # the previous query-then-update dance.
            top = result[0]
            analysis = ArticleAnalysis(filename=filename,
                                       label=top['label'],
                                       score=top['score'])
            try:
                session.merge(analysis)
                session.commit()
            except Exception:
                # Keep processing the remaining files; the session must be
                # rolled back before it can be used again.
                session.rollback()
                print(f"failed to persist analysis for {filename}")
    finally:
        # Always release the DB connection, even on an unexpected error.
        session.close()