From 679f1bd941b123e2cfa4a1129226cc1e40288384 Mon Sep 17 00:00:00 2001 From: Simon Moisy Date: Sat, 22 Mar 2025 08:00:58 +0800 Subject: [PATCH] chunked content to 512 --- main_article_analyzer.py | 60 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/main_article_analyzer.py b/main_article_analyzer.py index 36c8f50..3efb066 100644 --- a/main_article_analyzer.py +++ b/main_article_analyzer.py @@ -44,33 +44,41 @@ if __name__ == "__main__": Base.metadata.create_all(engine) - result = analyzer.classify_article_finbert("Strong earning growth and expending market shares have positionned the company for long term success.") - print(f'result {result}') + # result = analyzer.classify_article_finbert("Strong earning growth and expending market shares have positionned the company for long term success.") + # print(f'result {result}') - - # for file, content in html_files.items(): - # result = analyzer.classify_article_finbert(content) - - # filename = os.path.basename(file) - # print(f'result {result}') - - # label = result[0]['label'] - # score = result[0]['score'] - - # analysis = ArticleAnalysis(filename=filename, label=label, score=score) + for file, content in html_files.items(): + chunk_size = 512 + chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] - # try: - # session.add(analysis) - # session.commit() - # except: - # session.rollback() + results = [] + for chunk in chunks: + if chunk.strip(): + chunk_result = analyzer.classify_article_finbert(chunk) + results.extend(chunk_result) + + result = results if results else [{'label': 'neutral', 'score': 0.0}] - # existing = session.query(ArticleAnalysis).filter_by(filename=filename).first() - # if existing: - # existing.label = label - # existing.score = score - # session.commit() - # finally: - # session.close() + filename = os.path.basename(file) + print(f'result {result}') - # print(f"article [{file}] - analyzed as [{result}]\n") + # label = result[0]['label'] + # score = result[0]['score'] + + # analysis = ArticleAnalysis(filename=filename, label=label, score=score) + + # try: + # session.add(analysis) + # session.commit() + # except: + # session.rollback() + + # existing = session.query(ArticleAnalysis).filter_by(filename=filename).first() + # if existing: + # existing.label = label + # existing.score = score + # session.commit() + # finally: + # session.close() + + # print(f"article [{file}] - analyzed as [{result}]\n")