From a1e3264c444dd7edd4777ac856b165901e7d06f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=96=B9=E5=8D=9A?= <1016318004@qq.com> Date: Thu, 6 Mar 2025 17:41:47 +0800 Subject: [PATCH] =?UTF-8?q?fix(data-chain):=20=E8=BF=87=E6=BB=A4=E7=A9=BA?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在解析文档时,添加了过滤空文本块的逻辑 - 对于临时文档和非临时文档都进行了空文本块的过滤处理 - 优化了数据处理,提高了数据准确性和后续处理效率 --- data_chain/apps/base/task/document_task_handler.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/data_chain/apps/base/task/document_task_handler.py b/data_chain/apps/base/task/document_task_handler.py index 88a964f..ac72daf 100644 --- a/data_chain/apps/base/task/document_task_handler.py +++ b/data_chain/apps/base/task/document_task_handler.py @@ -59,6 +59,11 @@ class DocumentTaskHandler(): answer = await parser.parser(document_entity.id, file_path) chunk_list, chunk_link_list, images = answer['chunk_list'], answer['chunk_link_list'], answer[ 'image_chunks'] + new_chunk_list=[] + for chunk in chunk_list: + if len(chunk['text'].strip())!=0: + new_chunk_list.append(chunk) + chunk_list = new_chunk_list chunk_id_set = set() for chunk in chunk_list: chunk_id_set.add(chunk['id']) @@ -125,6 +130,8 @@ class DocumentTaskHandler(): await TaskManager.update(task_entity.id, {'status': TaskConstant.TASK_STATUS_SUCCESS}) TaskRedisHandler.put_task_by_tail(config['REDIS_SUCCESS_TASK_QUEUE_NAME'], str(task_entity.id)) except Exception as e: + import traceback + print(traceback.format_exc()) TaskRedisHandler.put_task_by_tail(config['REDIS_RESTART_TASK_QUEUE_NAME'], str(task_entity.id)) await TaskStatusReportManager.insert(TaskStatusReportEntity( task_id=task_entity.id, @@ -167,6 +174,11 @@ class DocumentTaskHandler(): parser_result = await parser.parser(document_entity.id, file_path, is_temporary_document=True) chunk_list, chunk_link_list, images = parser_result['chunk_list'], parser_result['chunk_link_list'], parser_result[ 'image_chunks'] + new_chunk_list=[] + for chunk in chunk_list: + if len(chunk['text'].strip())!=0: + new_chunk_list.append(chunk) + chunk_list = new_chunk_list await TaskStatusReportManager.insert(TaskStatusReportEntity( task_id=task_entity.id, message=f'Parse temporary document {document_entity.name} completed, waiting for uploading', -- Gitee