diff --git a/data_chain/apps/app.py b/data_chain/apps/app.py
index 6e69e20229680e9d2da9954c9ddaa278466788da..64fdf4efd0e2b3bc84035fb9f2615034611e2b50 100644
--- a/data_chain/apps/app.py
+++ b/data_chain/apps/app.py
@@ -52,7 +52,8 @@ from data_chain.rag import (
     vector_searcher,
     keyword_and_vector_searcher,
     doc2chunk_searcher,
-    doc2chunk_bfs_searcher
+    doc2chunk_bfs_searcher,
+    enhanced_by_llm_searcher
 )
 from data_chain.stores.database.database import DataBase, ActionEntity, DocumentTypeEntity
 from data_chain.manager.role_manager import RoleManager
diff --git a/data_chain/apps/base/task/worker/acc_testing_worker.py b/data_chain/apps/base/task/worker/acc_testing_worker.py
index 41fbfeb67430b6127c22e46f9f792022b590d127..8ae3ceb6efb3e8cfe2d27ac7b1133c43d7249cc3 100644
--- a/data_chain/apps/base/task/worker/acc_testing_worker.py
+++ b/data_chain/apps/base/task/worker/acc_testing_worker.py
@@ -143,6 +143,19 @@ class TestingWorker(BaseWorker):
             answer = qa_entity.answer
             chunk = qa_entity.chunk
             chunk_entities = await BaseSearcher.search(testing_entity.search_method, testing_entity.kb_id, question, top_k=testing_entity.top_k, doc_ids=None, banned_ids=[])
+            related_chunk_entities = []
+            banned_ids = [chunk_entity.id for chunk_entity in chunk_entities]
+            divide_tokens = llm.max_tokens // len(chunk_entities) if chunk_entities else llm.max_tokens
+            leave_tokens = 0
+            for chunk_entity in chunk_entities:
+                leave_tokens += divide_tokens - chunk_entity.tokens
+                sub_related_chunk_entities = await BaseSearcher.related_surround_chunk(chunk_entity, leave_tokens, banned_ids)
+                banned_ids += [sub_chunk_entity.id for sub_chunk_entity in sub_related_chunk_entities]
+                related_chunk_entities += sub_related_chunk_entities
+                for related_chunk_entity in sub_related_chunk_entities:
+                    leave_tokens -= related_chunk_entity.tokens
+                leave_tokens = max(leave_tokens, 0)
+            chunk_entities += related_chunk_entities
             doc_chunk_dict = {}
             for chunk_entity in chunk_entities:
                 if chunk_entity.doc_id not in doc_chunk_dict:
diff --git a/data_chain/apps/base/task/worker/parse_document_worker.py b/data_chain/apps/base/task/worker/parse_document_worker.py
index 32bc4e0bd24dd5f42bb40c54c4eba7a8d403c0fc..7b9a75ef4727fde404b1dcada2f6b554aeff48c9 100644
--- a/data_chain/apps/base/task/worker/parse_document_worker.py
+++ b/data_chain/apps/base/task/worker/parse_document_worker.py
@@ -289,6 +289,8 @@ class ParseDocumentWorker(BaseWorker):
                 if len(nodes) == 0 or (len(nodes) and (nodes[-1].type != ChunkType.TEXT or TokenTool.get_tokens(nodes[-1].content) + tokens > doc_entity.chunk_size)):
                     nodes.append(node)
                 else:
+                    if node.is_need_newline:
+                        nodes[-1].content += '\n'
                     nodes[-1].content += node.content
             else:
                 nodes.append(node)
diff --git a/data_chain/apps/service/chunk_service.py b/data_chain/apps/service/chunk_service.py
index 4e3d2e976b4f75589f072cc3d7e3dead12d04e4a..5c7fd21d52d0a2653b20df1083fd0b679a26fc39 100644
--- a/data_chain/apps/service/chunk_service.py
+++ b/data_chain/apps/service/chunk_service.py
@@ -101,10 +101,11 @@ class ChunkService:
         chunk_entities = await BaseSearcher.rerank(chunk_entities, req.query)
         chunk_entities = chunk_entities[:req.top_k]
         chunk_ids = [chunk_entity.id for chunk_entity in chunk_entities]
+        logging.info("[ChunkService] 搜索分片,查询结果数量: %s", len(chunk_entities))
         if req.is_related_surrounding:
             # 关联上下文
             tokens_limit = req.tokens_limit
-            tokens_limit_every_chunk = tokens_limit // len(chunk_entities)
+            tokens_limit_every_chunk = tokens_limit // len(chunk_entities) if len(chunk_entities) > 0 else tokens_limit
             leave_tokens = 0
             related_chunk_entities = []
             for chunk_entity in chunk_entities:
@@ -125,6 +126,7 @@ class ChunkService:
                 chunk_ids += [chunk_entity.id for chunk_entity in sub_related_chunk_entities]
                 related_chunk_entities += sub_related_chunk_entities
             chunk_entities += related_chunk_entities
+            logging.info("[ChunkService] 关联上下文后分片数量: %s", len(chunk_entities))
         search_chunk_msg = SearchChunkMsg(docChunks=[])
         if req.is_classify_by_doc:
             doc_chunks = await BaseSearcher.classify_by_doc_id(chunk_entities)
diff --git a/data_chain/parser/handler/pdf_parser.py b/data_chain/parser/handler/pdf_parser.py
index fcc93e0564beaa86bdb3fa5cf6c7c784969eb458..65248a01a3382f5c9a53350c114734d8fa302717 100644
--- a/data_chain/parser/handler/pdf_parser.py
+++ b/data_chain/parser/handler/pdf_parser.py
@@ -151,7 +151,44 @@ class PdfParser(BaseParser):
                 xref = image_info[0]
                 # 提取基础图片(如果存在)
                 base_image = pdf_doc.extract_image(xref)
-                position = page.get_image_rects(xref)[0]
+
+                # 检查提取的图片是否有效
+                if not base_image or "image" not in base_image:
+                    logging.warning("[PdfParser] 标准方法提取图片失败,跳过 xref=%s", xref)
+                    continue
+
+                # 检查位置信息
+                rects = page.get_image_rects(xref)
+                if not rects:
+                    logging.warning("[PdfParser] 找不到图片位置,尝试基于布局估算 xref=%s", xref)
+                    width, height = base_image.get("width", 0), base_image.get("height", 0)
+                    if width <= 0 or height <= 0:
+                        logging.warning("[PdfParser] 图片尺寸无效,跳过 xref=%s", xref)
+                        continue
+                    # 获取页面尺寸
+                    page_width, page_height = page.rect.width, page.rect.height
+
+                    # 方法1: 默认居中布局
+                    x0 = (page_width - width) / 2
+                    y0 = (page_height - height) / 2
+
+                    # 方法2: 考虑文本布局,假设图片在页面上半部分
+                    # 这里可以集成文本布局分析,例如获取页面上的文本块位置
+                    # 然后避免与文本重叠
+
+                    # 方法3: 基于图片大小的智能布局
+                    # 如果图片很大,可能是全页图片,位置应从(0,0)开始
+                    if width > page_width * 0.8 and height > page_height * 0.8:
+                        x0, y0 = 0, 0
+                    # 如果图片很小,可能是图标或装饰,可能在角落
+                    elif width < page_width * 0.2 and height < page_height * 0.2:
+                        # 放在右上角作为默认位置
+                        x0 = page_width - width - 10  # 留出边距
+                        y0 = 10  # 留出边距
+
+                    position = fitz.Rect(x0, y0, x0 + width, y0 + height)
+                else:
+                    position = rects[0]

                 # 获取图片的二进制数据
                 blob = base_image["image"]
@@ -262,6 +299,10 @@ class PdfParser(BaseParser):
                     sub_nodes_with_bbox, image_nodes_with_bbox)
                 nodes_with_bbox.extend(sub_nodes_with_bbox)

+            for i in range(1, len(nodes_with_bbox)):
+                '''根据bbox判断是否要进行换行'''
+                if nodes_with_bbox[i].bbox.y0 > nodes_with_bbox[i-1].bbox.y1 + 1:
+                    nodes_with_bbox[i].node.is_need_newline = True
             nodes = [node_with_bbox.node for node_with_bbox in nodes_with_bbox]
             PdfParser.image_related_node_in_link_nodes(nodes)  # 假设这个方法在别处定义

diff --git a/data_chain/parser/handler/xlsx_parser.py b/data_chain/parser/handler/xlsx_parser.py
index ba1facbbf1712c9d03cf3be447c74f31da6c2cd0..b261f71b09cccc0b226c11dc72bbce1f627f74bb 100644
--- a/data_chain/parser/handler/xlsx_parser.py
+++ b/data_chain/parser/handler/xlsx_parser.py
@@ -32,15 +32,37 @@ class XlsxParser(BaseParser):

     @staticmethod
     async def parser(file_path: str) -> ParseResult:
-        try:
-            if file_path.endswith(('.xlsx', '.xls')):
+        if file_path.endswith(('.xlsx', '.xls')):
+            try:
                 data = pd.read_excel(file_path, sheet_name=None, header=None)
-            elif file_path.endswith('.csv'):
+            except Exception as e:
+                err = f"[XlsxParser] 解析Excel文件失败,error: {e}"
+                logging.exception(err)
+                raise e
+        elif file_path.endswith('.csv'):
+            try:
                 data = pd.read_csv(file_path, header=None)
-        except Exception as e:
-            err = "读取xlsx文件失败"
-            logging.exception("[XlsxParser] %s", err)
-            raise e
+            except Exception as e:
+                err = f"[XlsxParser] 解析CSV文件失败,error: {e}"
+                logging.exception(err)
+                raise e
+        else:
+            data = None
+            try:
+                data = pd.read_excel(file_path, sheet_name=None, header=None)
+            except Exception as e:
+                err = f"[XlsxParser] 解析文件失败,error: {e}"
+                logging.exception(err)
+                try:
+                    data = pd.read_csv(file_path, header=None)
+                except Exception as e:
+                    err = f"[XlsxParser] 解析文件失败,error: {e}"
+                    logging.exception(err)
+            if data is None:
+                err = f"[XlsxParser] 无法解析文件,file_path: {file_path}"
+                logging.exception(err)
+                raise Exception(err)
+
         nodes = []
         for sheet_name, df in data.items():
             table_array = await XlsxParser.extract_table_to_array(df)
diff --git a/data_chain/parser/parse_result.py b/data_chain/parser/parse_result.py
index b701c0ff528c1291c5ecb0cd17835467382aff53..24dee22c1241748eb9a3c8d1f96944567ebb81b3 100644
--- a/data_chain/parser/parse_result.py
+++ b/data_chain/parser/parse_result.py
@@ -18,6 +18,7 @@ class ParseNode(BaseModel):
     content: Any = Field(..., description="节点内容")
     type: ChunkType = Field(..., description="节点类型")
     link_nodes: list = Field(..., description="链接节点")
+    is_need_newline: bool = Field(default=False, description="是否需要换行")


 class ParseResult(BaseModel):
diff --git a/data_chain/parser/tools/ocr_tool.py b/data_chain/parser/tools/ocr_tool.py
index 11739632af28721abdc0521a951cb06f2a3b2633..bfc7aab6f3415e73335625a127f27d4cf520b775 100644
--- a/data_chain/parser/tools/ocr_tool.py
+++ b/data_chain/parser/tools/ocr_tool.py
@@ -1,5 +1,4 @@
-from PIL import Image
-import asyncio
+from PIL import Image, ImageEnhance
 import yaml
 from paddleocr import PaddleOCR
 import numpy as np
@@ -13,23 +12,47 @@ class OcrTool:
     det_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_det_infer'
     rec_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_rec_infer'
     cls_model_dir = 'data_chain/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer'
+    # 优化 OCR 参数配置
     model = PaddleOCR(
         det_model_dir=det_model_dir,
         rec_model_dir=rec_model_dir,
         cls_model_dir=cls_model_dir,
-        use_angle_cls=True,  # 是否使用角度分类模型
-        use_space_char=True  # 是否使用空格字符
+        use_angle_cls=True,
+        use_space_char=True,
+        det_db_thresh=0.3,  # 降低文本检测阈值,提高敏感度
+        det_db_box_thresh=0.5,  # 调整文本框阈值
     )

     @staticmethod
     async def ocr_from_image(image: np.ndarray) -> list:
         try:
+
+            # 尝试OCR识别
             ocr_result = OcrTool.model.ocr(image)
-            if ocr_result is None or ocr_result[0] is None:
+
+            # 如果第一次尝试失败,尝试不同的参数配置
+            if ocr_result is None or len(ocr_result) == 0 or ocr_result[0] is None:
+                logging.warning("[OCRTool] 第一次OCR尝试失败,尝试降低阈值...")
+                # 创建临时OCR实例,使用更低的阈值
+                temp_ocr = PaddleOCR(
+                    det_model_dir=OcrTool.det_model_dir,
+                    rec_model_dir=OcrTool.rec_model_dir,
+                    cls_model_dir=OcrTool.cls_model_dir,
+                    use_angle_cls=True,
+                    use_space_char=True,
+                    det_db_thresh=0.2,  # 更低的检测阈值
+                    det_db_box_thresh=0.4,  # 更低的文本框阈值
+                )
+                ocr_result = temp_ocr.ocr(image)
+
+            # 记录OCR结果状态
+            if ocr_result is None or len(ocr_result) == 0 or ocr_result[0] is None:
+                logging.warning("[OCRTool] 图片无法识别文本")
                 return None
+
             return ocr_result
         except Exception as e:
-            err = f"[OCRTool] OCR识别失败 {e}"
+            err = f"[OCRTool] OCR识别失败: {e}"
             logging.exception(err)
             return None

diff --git a/data_chain/rag/base_searcher.py b/data_chain/rag/base_searcher.py
index 72e65a9d0063bff652e629ed86447cb6e8755164..ef96e0f61ca6d1f2b5a1e65937d3426638eb62bb 100644
--- a/data_chain/rag/base_searcher.py
+++ b/data_chain/rag/base_searcher.py
@@ -65,7 +65,7 @@ class BaseSearcher:
         :param query: 查询
         :return: 相关上下文
         """
-        chunk_entities = await ChunkManager.fetch_surrounding_chunk_by_doc_id_and_global_offset(chunk_entity.doc_id, chunk_entity.global_offset, 50, banned_ids)
+        chunk_entities = await ChunkManager.fetch_surrounding_chunk_by_doc_id_and_global_offset(chunk_entity.doc_id, chunk_entity.global_offset, 100, banned_ids)
         chunk_entity_dict = {}
         lower = chunk_entity.global_offset-1
         upper = chunk_entity.global_offset+1
diff --git a/data_chain/rag/doc2chunk_bfs_searcher.py b/data_chain/rag/doc2chunk_bfs_searcher.py
index d30f21b9225b13c8e8e257c8d8ae27f66d2fa926..c72e7bd1c1a05b2c9ea68e027dac522714871210 100644
--- a/data_chain/rag/doc2chunk_bfs_searcher.py
+++ b/data_chain/rag/doc2chunk_bfs_searcher.py
@@ -1,3 +1,4 @@
+import asyncio
 import uuid
 from pydantic import BaseModel, Field
 import random
@@ -33,7 +34,15 @@ class Doc2ChunkBfsSearcher(BaseSearcher):
         try:
             root_chunk_entities_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k//2, doc_ids, banned_ids, ChunkParseTopology.TREEROOT.value)
             banned_ids += [chunk_entity.id for chunk_entity in root_chunk_entities_keyword]
-            root_chunk_entities_vector = await ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(root_chunk_entities_keyword), doc_ids, banned_ids, ChunkParseTopology.TREEROOT.value)
+            root_chunk_entities_vector = []
+            for _ in range(3):
+                try:
+                    root_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(root_chunk_entities_keyword), doc_ids, banned_ids, ChunkParseTopology.TREEROOT.value), timeout=3)
+                    break
+                except Exception as e:
+                    err = f"[Doc2ChunkBfsSearcher] 向量检索失败,error: {e}"
+                    logging.error(err)
+                    continue
             banned_ids += [chunk_entity.id for chunk_entity in root_chunk_entities_vector]
             chunk_entities = root_chunk_entities_keyword + root_chunk_entities_vector
             pre_ids = [chunk_entity.id for chunk_entity in chunk_entities]
@@ -42,13 +51,21 @@ class Doc2ChunkBfsSearcher(BaseSearcher):
             while rd < max_retry:
                 root_chunk_entities_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k//2, doc_ids, banned_ids, None, pre_ids)
                 banned_ids += [chunk_entity.id for chunk_entity in root_chunk_entities_keyword]
-                root_chunk_entities_vector = await ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(root_chunk_entities_keyword), doc_ids, banned_ids, None, pre_ids)
+                root_chunk_entities_vector = []
+                for _ in range(3):
+                    try:
+                        root_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(root_chunk_entities_keyword), doc_ids, banned_ids, None, pre_ids), timeout=3)
+                        break
+                    except Exception as e:
+                        err = f"[Doc2ChunkBfsSearcher] 向量检索失败,error: {e}"
+                        logging.error(err)
+                        continue
                 banned_ids += [chunk_entity.id for chunk_entity in root_chunk_entities_vector]
                 sub_chunk_entities = root_chunk_entities_keyword + root_chunk_entities_vector
                 if len(sub_chunk_entities) == 0:
                     break
                 chunk_entities += sub_chunk_entities
-                pre_ids += [chunk_entity.id for chunk_entity in sub_chunk_entities]
+                pre_ids = [chunk_entity.id for chunk_entity in sub_chunk_entities]
                 rd += 1
         except Exception as e:
             err = f"[KeywordVectorSearcher] 关键词向量检索失败,error: {e}"
diff --git a/data_chain/rag/doc2chunk_searcher.py b/data_chain/rag/doc2chunk_searcher.py
index 437670289ac03e2abbfdd55e495f4a04058b4758..fc438a0b11b3632bd14a6d3b48b86d2031f60985 100644
--- a/data_chain/rag/doc2chunk_searcher.py
+++ b/data_chain/rag/doc2chunk_searcher.py
@@ -1,3 +1,4 @@
+import asyncio
 import uuid
 from pydantic import BaseModel, Field
 import random
@@ -32,11 +33,27 @@ class Doc2ChunkSearcher(BaseSearcher):
         try:
             doc_entities_keyword = await DocumentManager.get_top_k_document_by_kb_id_keyword(kb_id, query, top_k, doc_ids, banned_ids)
             use_doc_ids = [doc_entity.id for doc_entity in doc_entities_keyword]
-            doc_entities_vector = await DocumentManager.get_top_k_document_by_kb_id_vector(kb_id, vector, top_k-len(doc_entities_keyword), doc_ids, banned_ids)
+            doc_entities_vector = []
+            for _ in range(3):
+                try:
+                    doc_entities_vector = await asyncio.wait_for(DocumentManager.get_top_k_document_by_kb_id_vector(kb_id, vector, top_k-len(doc_entities_keyword), doc_ids, banned_ids), timeout=3)
+                    break
+                except Exception as e:
+                    err = f"[Doc2ChunkSearcher] 向量检索失败,error: {e}"
+                    logging.error(err)
+                    continue
             use_doc_ids += [doc_entity.id for doc_entity in doc_entities_vector]
             chunk_entities_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k//2, use_doc_ids, banned_ids)
             chunk_ids = [chunk_entity.id for chunk_entity in chunk_entities_keyword]
-            chunk_entities_vector += await ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_keyword), use_doc_ids, banned_ids+chunk_ids)
+            chunk_entities_vector = []
+            for _ in range(3):
+                try:
+                    chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_keyword), use_doc_ids, banned_ids+chunk_ids), timeout=3)
+                    break
+                except Exception as e:
+                    err = f"[Doc2ChunkSearcher] 向量检索失败,error: {e}"
+                    logging.error(err)
+                    continue
             chunk_entities = chunk_entities_keyword + chunk_entities_vector
         except Exception as e:
             err = f"[KeywordVectorSearcher] 关键词向量检索失败,error: {e}"
diff --git a/data_chain/rag/enhanced_by_llm_searcher.py b/data_chain/rag/enhanced_by_llm_searcher.py
index aab7c04fca902b2558b943c196a8da4dfd26790d..9ac2dd4ad844d20267e29d770e29f094a8ff2e23 100644
--- a/data_chain/rag/enhanced_by_llm_searcher.py
+++ b/data_chain/rag/enhanced_by_llm_searcher.py
@@ -1,3 +1,4 @@
+import asyncio
 import uuid
 import yaml
 from pydantic import BaseModel, Field
@@ -38,7 +39,7 @@ class EnhancedByLLMSearcher(BaseSearcher):
         prompt_template = prompt_dict['CHUNK_QUERY_MATCH_PROMPT']
         chunk_entities = []
         rd = 0
-        max_retry = 5
+        max_retry = 15
         llm = LLM(
             openai_api_key=config['OPENAI_API_KEY'],
             openai_api_base=config['OPENAI_API_BASE'],
@@ -47,18 +48,33 @@ class EnhancedByLLMSearcher(BaseSearcher):
         )
         while len(chunk_entities) < top_k and rd < max_retry:
             rd += 1
-            sub_chunk_entities = await ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids)
+            sub_chunk_entities_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k, doc_ids, banned_ids)
+            chunk_ids = [chunk_entity.id for chunk_entity in sub_chunk_entities_keyword]
+            banned_ids += chunk_ids
+            sub_chunk_entities_vector = []
+            for _ in range(3):
+                try:
+                    sub_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids), timeout=3)
+                    break
+                except Exception as e:
+                    err = f"[EnhancedByLLMSearcher] 向量检索失败,error: {e}"
+                    logging.error(err)
+                    continue
+            chunk_ids = [chunk_entity.id for chunk_entity in sub_chunk_entities_vector]
+            banned_ids += chunk_ids
+            sub_chunk_entities = sub_chunk_entities_keyword + sub_chunk_entities_vector
             for chunk_entity in sub_chunk_entities:
                 sys_call = prompt_template.format(
-                    chunk=chunk_entity.text,
-                    query=query,
+                    chunk=TokenTool.get_k_tokens_words_from_content(chunk_entity.text, llm.max_tokens),
+                    question=query,
                 )
                 user_call = "请输出YES或NO"
                 result = await llm.nostream([], sys_call, user_call)
                 result = result.lower()
                 if result == "yes":
                     chunk_entities.append(chunk_entity)
-            chunk_ids = [chunk_entity.id for chunk_entity in sub_chunk_entities]
+                    logging.info(
+                        f"[EnhancedByLLMSearcher] 匹配到分片: {chunk_entity.id}, 分片内容: {chunk_entity.text[:50]}...")
             banned_ids += chunk_ids
         return chunk_entities[:top_k]
     except Exception as e:
diff --git a/data_chain/rag/keyword_and_vector_searcher.py b/data_chain/rag/keyword_and_vector_searcher.py
index 18cfdc6e7dfd161e4706ced7025d5f688644bc8b..d3753029c57c1ae2ae0a9f53632c0f543e445856 100644
--- a/data_chain/rag/keyword_and_vector_searcher.py
+++ b/data_chain/rag/keyword_and_vector_searcher.py
@@ -1,3 +1,4 @@
+import asyncio
 import uuid
 from pydantic import BaseModel, Field
 import random
@@ -31,7 +32,15 @@ class KeywordVectorSearcher(BaseSearcher):
         try:
             chunk_entities_get_by_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k//2, doc_ids, banned_ids)
             chunk_ids = [chunk_entity.id for chunk_entity in chunk_entities_get_by_keyword]
-            chunk_entities_get_by_vector = await ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword), doc_ids, banned_ids+chunk_ids)
+            chunk_entities_get_by_vector = []
+            for _ in range(3):
+                try:
+                    chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword), doc_ids, banned_ids+chunk_ids), timeout=3)
+                    break
+                except Exception as e:
+                    err = f"[KeywordVectorSearcher] 向量检索失败,error: {e}"
+                    logging.error(err)
+                    continue
             chunk_entities = chunk_entities_get_by_keyword + chunk_entities_get_by_vector
         except Exception as e:
             err = f"[KeywordVectorSearcher] 关键词向量检索失败,error: {e}"
diff --git a/data_chain/rag/vector_searcher.py b/data_chain/rag/vector_searcher.py
index b55ab43892d3fc6dc8f64cd52487d91bb12e9a58..dad5e8676792927fa28f27a0ec9b8ac0cb08a079 100644
--- a/data_chain/rag/vector_searcher.py
+++ b/data_chain/rag/vector_searcher.py
@@ -1,3 +1,4 @@
+import asyncio
 import uuid
 from pydantic import BaseModel, Field
 import random
@@ -25,10 +26,13 @@ class VectorSearcher(BaseSearcher):
         :return: 检索结果
         """
         vector = await Embedding.vectorize_embedding(query)
-        try:
-            chunk_entities = await ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids)
-        except Exception as e:
-            err = f"[VectorSearcher] 向量检索失败,error: {e}"
-            logging.exception(err)
-            return []
+        chunk_entities = []
+        for _ in range(3):
+            try:
+                chunk_entities = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids), timeout=3)
+                break
+            except Exception as e:
+                err = f"[VectorSearcher] 向量检索失败,error: {e}"
+                logging.exception(err)
+                continue
         return chunk_entities
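
Note on the retry pattern: the same three-attempt asyncio.wait_for loop around the vector search is repeated inline in vector_searcher.py, keyword_and_vector_searcher.py, doc2chunk_searcher.py, doc2chunk_bfs_searcher.py and enhanced_by_llm_searcher.py. Below is a minimal sketch of how that pattern could be factored into one shared helper; the helper name retry_with_timeout and its signature are assumptions for illustration only, while asyncio.wait_for, the 3 attempts and the 3-second timeout come from the diff itself.

    import asyncio
    import logging
    from typing import Awaitable, Callable, TypeVar

    T = TypeVar("T")

    async def retry_with_timeout(factory: Callable[[], Awaitable[T]], default: T,
                                 attempts: int = 3, timeout: float = 3.0,
                                 tag: str = "[VectorSearcher]") -> T:
        """Run the coroutine built by `factory`, retrying on timeout or error; return `default` if all attempts fail."""
        for _ in range(attempts):
            try:
                # A fresh coroutine must be created on every attempt, hence the zero-argument factory.
                return await asyncio.wait_for(factory(), timeout=timeout)
            except Exception as e:
                logging.error("%s 向量检索失败,error: %s", tag, e)
        return default

    # Hypothetical usage mirroring the inline loops above:
    # chunk_entities = await retry_with_timeout(
    #     lambda: ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids),
    #     default=[])

Swallowing every Exception, not just asyncio.TimeoutError, matches what the inline loops in the diff do; a stricter helper could re-raise non-timeout errors instead.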