diff --git a/Dockerfile b/Dockerfile
index afada6e7f54a96ec5967a9884de348c526ec8a31..672fe70782987b8570230be9bae93dcc71ebf202 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM hub.oepkgs.net/neocopilot/data_chain_back_base:1230
+FROM hub.oepkgs.net/neocopilot/data_chain_back_base:0.9.5-x86
 
 COPY --chown=1001:1001 --chmod=750 ./ /rag-service/
 WORKDIR /rag-service
diff --git a/data_chain/apps/router/knowledge_base.py b/data_chain/apps/router/knowledge_base.py
index 44d9eeb45252f1870921826e8fd85b2b50553b28..2c373bb02d0ea32006570dc4db466bab17740677 100644
--- a/data_chain/apps/router/knowledge_base.py
+++ b/data_chain/apps/router/knowledge_base.py
@@ -221,9 +221,9 @@ async def get_stream_answer(req: QueryRequest, response: Response):
     tokens_upper = model_dto.max_tokens
     try:
         question = await question_rewrite(req.history, req.question, model_dto)
-        max_tokens = tokens_upper//3
+        max_tokens = tokens_upper//3*2
         bac_info = ''
-        document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens=tokens_upper//2, topk=req.top_k)
+        document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens=2*tokens_upper, topk=req.top_k)
         for i in range(len(document_chunk_list)):
             document_name = document_chunk_list[i]['document_name']
             chunk_list = document_chunk_list[i]['chunk_list']
@@ -275,9 +275,9 @@ async def get_answer(req: QueryRequest):
     tokens_upper = model_dto.max_tokens
     try:
         question = await question_rewrite(req.history, req.question, model_dto)
-        max_tokens = tokens_upper//3
+        max_tokens = tokens_upper//3*2
         bac_info = ''
-        document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens = tokens_upper//2, topk=req.top_k)
+        document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens = 2*tokens_upper, topk=req.top_k)
         for i in range(len(document_chunk_list)):
             document_name = document_chunk_list[i]['document_name']
             chunk_list = document_chunk_list[i]['chunk_list']
diff --git a/data_chain/parser/handler/base_parser.py b/data_chain/parser/handler/base_parser.py
index ca79f69b9d14d625dd07158d5c8c1e79a85abeb3..c4112e14aed23991f6e05e44209c6ebeb442dcf3 100644
--- a/data_chain/parser/handler/base_parser.py
+++ b/data_chain/parser/handler/base_parser.py
@@ -406,10 +406,10 @@ class BaseService:
         """
         chunks = []
 
-        def get_edges(node, parent_id=None, dep=0):
+        def get_edges(node, parent_id=None, dfs_order=0):
             chunk = self.package_to_chunk(text=node["text"], tokens=split_tools.get_tokens(tree["text"]),
                                           status="", type_big=node["type"], type_small='tree', type_attr=node['type_attr'],
-                                          global_offset=dep, link_to=parent_id, )
+                                          global_offset=dfs_order, link_to=parent_id, )
             node['id'] = chunk['id']
             chunks.append(chunk)
 
@@ -417,7 +417,8 @@ class BaseService:
             if 'children' in node and node['children']:
                 for child in node['children']:
                     # 递归处理子节点
-                    get_edges(child, node['id'], dep + 1)
+                    dfs_order = get_edges(child, node['id'], dfs_order+1)
+            return dfs_order
 
         get_edges(tree)
 
diff --git a/data_chain/parser/handler/html_parser.py b/data_chain/parser/handler/html_parser.py
index 5b0a0471e0111b3aca0ef3f65842b5ba552777ea..f2ec567165c05279658ea2f9b7da902df4ee7185 100644
--- a/data_chain/parser/handler/html_parser.py
+++ b/data_chain/parser/handler/html_parser.py
@@ -20,20 +20,22 @@ class HtmlService(BaseService):
     def element_to_dict(self, element):
         node_dict = {
            "tag": element.name,  # 当前节点的标签名
-           "attributes": element.attrs if element.attrs else None,  # 标签的属性(如果有)
-           "text": element.get_text(strip=True) if element.string else None,  # 标签内的文字
+           "attributes": element.attrs if element.attrs else '',  # 标签的属性(如果有)
+           "text": element.get_text(strip=True) if element.string else '',  # 标签内的文字
            "children": [],  # 子节点列表
            "id": self.get_uuid(),
-           "type": "general",
+           "type": "para",
            "type_attr": 'leaf',
         }
 
        # 处理图片
        if element.name == "img":
            node_dict["img"] = element.get('src', None)
+           node_dict['type'] = 'img'
        # 处理列表
        elif element.name in ["ul", "ol"]:
            node_dict["list"] = [li.get_text(strip=True) for li in element.find_all('li')]
+           node_dict['type'] = 'table'
 
        # 递归处理子元素
        for child in element.children:
diff --git a/requirements.txt b/requirements.txt
index ff3a9096d5b764ec669af62ba29d24f628a1d9e8..feb7b70bd6fc1fa160b1d95e255e9aeb42e6e5e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -43,3 +43,4 @@ tika==2.6.0
 tiktoken==0.8.0
 urllib3==2.2.1
 uvicorn==0.21.0
+xlrd==2.0.1
\ No newline at end of file