From f16d729ed6ea68934479974a9924f3bdfdbdee84 Mon Sep 17 00:00:00 2001 From: zxstty Date: Thu, 27 Mar 2025 12:41:20 +0000 Subject: [PATCH 1/5] update data_chain/apps/router/knowledge_base.py. Signed-off-by: zxstty --- data_chain/apps/router/knowledge_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data_chain/apps/router/knowledge_base.py b/data_chain/apps/router/knowledge_base.py index 44d9eeb..0d8181e 100644 --- a/data_chain/apps/router/knowledge_base.py +++ b/data_chain/apps/router/knowledge_base.py @@ -221,9 +221,9 @@ async def get_stream_answer(req: QueryRequest, response: Response): tokens_upper = model_dto.max_tokens try: question = await question_rewrite(req.history, req.question, model_dto) - max_tokens = tokens_upper//3 + max_tokens = tokens_upper//3*2 bac_info = '' - document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens=tokens_upper//2, topk=req.top_k) + document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens=tokens_upper, topk=req.top_k) for i in range(len(document_chunk_list)): document_name = document_chunk_list[i]['document_name'] chunk_list = document_chunk_list[i]['chunk_list'] @@ -275,9 +275,9 @@ async def get_answer(req: QueryRequest): tokens_upper = model_dto.max_tokens try: question = await question_rewrite(req.history, req.question, model_dto) - max_tokens = tokens_upper//3 + max_tokens = tokens_upper//3*2 bac_info = '' - document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens = tokens_upper//2, topk=req.top_k) + document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens = tokens_upper, topk=req.top_k) for i in range(len(document_chunk_list)): document_name = document_chunk_list[i]['document_name'] chunk_list = document_chunk_list[i]['chunk_list'] -- Gitee From 6e4f7c2edc32fc0953967802f6bb6e675195e249 Mon Sep 17 00:00:00 2001 From: zxstty Date: Thu, 27 Mar 2025 12:50:18 +0000 Subject: [PATCH 2/5] update data_chain/apps/router/knowledge_base.py. Signed-off-by: zxstty --- data_chain/apps/router/knowledge_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_chain/apps/router/knowledge_base.py b/data_chain/apps/router/knowledge_base.py index 0d8181e..2c373bb 100644 --- a/data_chain/apps/router/knowledge_base.py +++ b/data_chain/apps/router/knowledge_base.py @@ -223,7 +223,7 @@ async def get_stream_answer(req: QueryRequest, response: Response): question = await question_rewrite(req.history, req.question, model_dto) max_tokens = tokens_upper//3*2 bac_info = '' - document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens=tokens_upper, topk=req.top_k) + document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens=2*tokens_upper, topk=req.top_k) for i in range(len(document_chunk_list)): document_name = document_chunk_list[i]['document_name'] chunk_list = document_chunk_list[i]['chunk_list'] @@ -277,7 +277,7 @@ async def get_answer(req: QueryRequest): question = await question_rewrite(req.history, req.question, model_dto) max_tokens = tokens_upper//3*2 bac_info = '' - document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens = tokens_upper, topk=req.top_k) + document_chunk_list = await get_similar_chunks(content=question, kb_id=req.kb_sn, temporary_document_ids=req.document_ids, max_tokens = 2*tokens_upper, topk=req.top_k) for i in range(len(document_chunk_list)): document_name = document_chunk_list[i]['document_name'] chunk_list = document_chunk_list[i]['chunk_list'] -- Gitee From 90db05712ecbae42356ccc9c2bc5e623a87ae038 Mon Sep 17 00:00:00 2001 From: zxstty Date: Sat, 29 Mar 2025 16:46:21 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dxlsx=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- data_chain/parser/handler/html_parser.py | 4 ++-- data_chain/parser/handler/xlsx_parser.py | 1 + requirements.txt | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index afada6e..672fe70 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM hub.oepkgs.net/neocopilot/data_chain_back_base:1230 +FROM hub.oepkgs.net/neocopilot/data_chain_back_base:0.9.5-x86 COPY --chown=1001:1001 --chmod=750 ./ /rag-service/ WORKDIR /rag-service diff --git a/data_chain/parser/handler/html_parser.py b/data_chain/parser/handler/html_parser.py index 5b0a047..b3961d3 100644 --- a/data_chain/parser/handler/html_parser.py +++ b/data_chain/parser/handler/html_parser.py @@ -20,8 +20,8 @@ class HtmlService(BaseService): def element_to_dict(self, element): node_dict = { "tag": element.name, # 当前节点的标签名 - "attributes": element.attrs if element.attrs else None, # 标签的属性(如果有) - "text": element.get_text(strip=True) if element.string else None, # 标签内的文字 + "attributes": element.attrs if element.attrs else '', # 标签的属性(如果有) + "text": element.get_text(strip=True) if element.string else '', # 标签内的文字 "children": [], # 子节点列表 "id": self.get_uuid(), "type": "general", diff --git a/data_chain/parser/handler/xlsx_parser.py b/data_chain/parser/handler/xlsx_parser.py index 0cf72a2..5e0dde0 100644 --- a/data_chain/parser/handler/xlsx_parser.py +++ b/data_chain/parser/handler/xlsx_parser.py @@ -30,5 +30,6 @@ class XlsxService(BaseService): data = self.read_xlsx(file_path) sentences = self.extract_table(data) chunks = self.build_chunks_by_lines(sentences) + print(chunks) chunk_links = self.build_chunk_links_by_line(chunks) return chunks, chunk_links, [] diff --git a/requirements.txt b/requirements.txt index ff3a909..feb7b70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,3 +43,4 @@ tika==2.6.0 tiktoken==0.8.0 urllib3==2.2.1 uvicorn==0.21.0 +xlrd==2.0.1 \ No newline at end of file -- Gitee From 3312accf071ff9910e0fa6005e9c121075f10bf4 Mon Sep 17 00:00:00 2001 From: zxstty Date: Sat, 29 Mar 2025 17:13:22 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=E5=8E=BB=E9=99=A4xlsx=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=E6=97=A0=E7=94=A8=E6=89=93=E5=8D=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/parser/handler/xlsx_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data_chain/parser/handler/xlsx_parser.py b/data_chain/parser/handler/xlsx_parser.py index 5e0dde0..0cf72a2 100644 --- a/data_chain/parser/handler/xlsx_parser.py +++ b/data_chain/parser/handler/xlsx_parser.py @@ -30,6 +30,5 @@ class XlsxService(BaseService): data = self.read_xlsx(file_path) sentences = self.extract_table(data) chunks = self.build_chunks_by_lines(sentences) - print(chunks) chunk_links = self.build_chunk_links_by_line(chunks) return chunks, chunk_links, [] -- Gitee From 2d89ef65f7de269ec80f77b50bf1ed30e3a45100 Mon Sep 17 00:00:00 2001 From: zxstty Date: Sat, 29 Mar 2025 17:40:32 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=E4=BD=BF=E7=94=A8dfs=E5=BA=8F=E4=BD=9C?= =?UTF-8?q?=E4=B8=BAhtml=E7=9A=84global=20offset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/parser/handler/base_parser.py | 7 ++++--- data_chain/parser/handler/html_parser.py | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/data_chain/parser/handler/base_parser.py b/data_chain/parser/handler/base_parser.py index ca79f69..c4112e1 100644 --- a/data_chain/parser/handler/base_parser.py +++ b/data_chain/parser/handler/base_parser.py @@ -406,10 +406,10 @@ class BaseService: """ chunks = [] - def get_edges(node, parent_id=None, dep=0): + def get_edges(node, parent_id=None, dfs_order=0): chunk = self.package_to_chunk(text=node["text"], tokens=split_tools.get_tokens(tree["text"]), status="", type_big=node["type"], type_small='tree', type_attr=node['type_attr'], - global_offset=dep, link_to=parent_id, ) + global_offset=dfs_order, link_to=parent_id, ) node['id'] = chunk['id'] chunks.append(chunk) @@ -417,7 +417,8 @@ class BaseService: if 'children' in node and node['children']: for child in node['children']: # 递归处理子节点 - get_edges(child, node['id'], dep + 1) + dfs_order = get_edges(child, node['id'], dfs_order+1) + return dfs_order get_edges(tree) diff --git a/data_chain/parser/handler/html_parser.py b/data_chain/parser/handler/html_parser.py index b3961d3..f2ec567 100644 --- a/data_chain/parser/handler/html_parser.py +++ b/data_chain/parser/handler/html_parser.py @@ -24,16 +24,18 @@ class HtmlService(BaseService): "text": element.get_text(strip=True) if element.string else '', # 标签内的文字 "children": [], # 子节点列表 "id": self.get_uuid(), - "type": "general", + "type": "para", "type_attr": 'leaf', } # 处理图片 if element.name == "img": node_dict["img"] = element.get('src', None) + node_dict['type'] = 'img' # 处理列表 elif element.name in ["ul", "ol"]: node_dict["list"] = [li.get_text(strip=True) for li in element.find_all('li')] + node_dict['type'] = 'table' # 递归处理子元素 for child in element.children: -- Gitee