From 991ccf8f462776540d17e5b184efe0cce8731824 Mon Sep 17 00:00:00 2001 From: zxstty Date: Wed, 5 Mar 2025 11:47:38 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E5=A2=9E=E5=8A=A0pptx=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/parser/handler/base_parser.py | 92 ++++++++++++- data_chain/parser/handler/docx_parser.py | 138 ++++++-------------- data_chain/parser/handler/pptx_parser.py | 84 ++++++++++++ data_chain/parser/service/parser_service.py | 4 +- data_chain/parser/tools/ocr.py | 4 +- data_chain/parser/tools/split.py | 4 +- requirements.txt | 3 +- utils/parser/handler/docx_parser.py | 6 +- 8 files changed, 224 insertions(+), 111 deletions(-) create mode 100644 data_chain/parser/handler/pptx_parser.py diff --git a/data_chain/parser/handler/base_parser.py b/data_chain/parser/handler/base_parser.py index 6de4b39b..d465f525 100644 --- a/data_chain/parser/handler/base_parser.py +++ b/data_chain/parser/handler/base_parser.py @@ -1,13 +1,14 @@ import os import uuid import json + +import pptx.table from data_chain.logger.logger import logger as logging from pandas import DataFrame from docx.table import Table as DocxTable +import pptx from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity -import secrets -import shutil from data_chain.manager.document_manager import DocumentManager from data_chain.manager.model_manager import ModelManager from data_chain.parser.tools.split import split_tools @@ -28,6 +29,7 @@ class BaseService: self.llm_max_tokens = None self.llm = None self.tokens = None + self.ocr_tool = None async def init_service(self, llm_entity, tokens, parser_method): self.parser_method = parser_method @@ -38,9 +40,9 @@ class BaseService: openai_api_key=Security.decrypt( llm_entity.encrypted_openai_api_key, json.loads(llm_entity.encrypted_config) - ), - max_tokens=llm_entity.max_tokens, - ) + ), + max_tokens=llm_entity.max_tokens, + ) self.llm_max_tokens = llm_entity.max_tokens self.tokens = tokens self.vectorizer = TfidfVectorizer() @@ -137,7 +139,7 @@ class BaseService: row_string_list = [s.replace('|', '||') for s in row.astype(str).tolist()] cell_num = max(cell_num, len(row_string_list)) new_table.append(row_string_list) - elif isinstance(table, DocxTable): + elif isinstance(table, DocxTable) or isinstance(table, pptx.table.Table): if table.rows: for row in table.rows: row_string_list = [s.replace('|', '||') for s in (cell.text.strip() for cell in row.cells)] @@ -167,6 +169,84 @@ class BaseService: return result + async def ocr_from_images_in_lines(self, lines): + # 获取图像相邻文本 + last_para_pre = "" + for i in range(len(lines)): + line = lines[i] + if line['type'] == 'image': + lines[i]['related_text'] = last_para_pre + elif line['type'] == 'para': + last_para_pre = line['text'] + elif line['type'] == 'table': + pass + last_para_bac = "" + for i in range(len(lines) - 1, -1, -1): + line = lines[i] + if line['type'] == 'image': + lines[i]['related_text'] += last_para_bac + elif line['type'] == 'para': + last_para_bac = line['text'] + elif line['type'] == 'table': + pass + for line in lines: + if line['type'] == 'image': + line['text'] = await self.ocr_tool.image_to_text(line['image'], text=line['related_text']) + return lines + + async def change_lines(self, lines): + """ + 修整处理lines,根据不同的类型(图像、段落、表格)处理每一行,并根据method参数决定处理方式。 + + 参数: + - lines (list): 需要处理的行列表,每行包含内容和类型。 + 返回: + - tuple: 包含处理后的句子列表和图像列表的元组。 + """ + new_lines = [] 
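+        # Images are written to the tmp folder and then linked either to a dedicated
+        # description chunk (ocr/enhanced parsing) or to the preceding paragraph chunk;
+        # paragraph and table lines are re-emitted with fresh chunk ids.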
+ images = [] + last_para_id = None + for line in lines: + if line['type'] == 'image': + # 处理图像 + image_id = self.get_uuid() + image = line['image'] + image_bytes = image.tobytes() + image_extension = line['extension'] + await self.insert_image_to_tmp_folder(image_bytes, image_id, image_extension) + if self.parser_method in ['ocr', 'enhanced']: + # 将图片关联到图片的描述chunk上 + chunk_id = self.get_uuid() + new_lines.append({'id': chunk_id, + 'type': 'image'}) + new_lines[-1]['image'] = np.array(image) + images.append({ + 'id': image_id, + 'chunk_id': chunk_id, + 'extension': image_extension, + }) + else: + # 将图片关联到上一个段落chunk上 + images.append({ + 'id': image_id, + 'chunk_id': last_para_id, + 'extension': image_extension, + }) + + elif line['type'] == 'para': + # 处理段落 + new_lines.append({'id': self.get_uuid(), + 'text': line['text'], + 'type': line['type']}) + last_para_id = new_lines[-1]['id'] + + elif line[1] == 'table': + # 处理表格 + new_lines.append({'id': self.get_uuid(), + 'text': line['text'], + 'type': line['type']}) + return new_lines, images + def package_to_chunk(self, **kwargs): """ 整合成chunk diff --git a/data_chain/parser/handler/docx_parser.py b/data_chain/parser/handler/docx_parser.py index 120b9931..74ebcbaf 100644 --- a/data_chain/parser/handler/docx_parser.py +++ b/data_chain/parser/handler/docx_parser.py @@ -1,10 +1,7 @@ -from data_chain.logger.logger import logger as logging import docx -import uuid from io import BytesIO from PIL import Image import numpy as np -import docx from docx.document import Document from docx.text.paragraph import Paragraph from docx.parts.image import ImagePart @@ -15,12 +12,12 @@ from docx.oxml.shape import CT_Picture import mimetypes from data_chain.parser.handler.base_parser import BaseService from data_chain.parser.tools.ocr import BaseOCR +from data_chain.logger.logger import logger as logging class DocxService(BaseService): def __init__(self): super().__init__() - self.ocr_tool = None def open_file(self, file_path): try: @@ -73,29 +70,56 @@ class DocxService(BaseService): image_parts = self.get_imageparts_from_run(run, parent) if image_parts: if text_part: - lines.append((text_part, 'para')) + lines.append( + { + 'text': text_part, + 'type': 'text' + } + ) text_part = '' for image_part in image_parts: try: image_blob = image_part.image.blob content_type = image_part.content_type except Exception as e: - logging.error(f"Image blob and part get failed due to :{e}") + logging.error(f"Get Image blob and part failed due to :{e}") + continue extension = mimetypes.guess_extension(content_type).replace('.', '') - lines.append(((Image.open(BytesIO(image_blob)), extension), 'image')) + lines.append( + { + 'image': Image.open(BytesIO(image_blob)), + 'extension': extension, + 'type': 'image' + } + ) else: text_part += run.text run_index += 1 if text_part: - lines.append((text_part, 'para')) + lines.append( + { + 'text': text_part, + 'type': 'text' + } + ) else: - lines.append((paragraph.text, 'para')) + lines.append( + { + 'text': paragraph.text, + 'type': 'text' + } + ) elif isinstance(child, CT_Tbl): table = Table(child, parent) rows = self.split_table(table) for row in rows: - lines.append((row, 'table')) + lines.append( + { + 'text': row, + 'type': 'table' + } + ) elif isinstance(child, CT_Picture): img_id = child.xpath('.//a:blip/@r:embed')[0] part = parent.part.related_parts[img_id] @@ -107,93 +131,15 @@ class DocxService(BaseService): logging.error(f'Get image blob and content type failed due to: {e}') continue extension = 
mimetypes.guess_extension(content_type).replace('.', '') - lines.append(((Image.open(BytesIO(image_blob)), extension), 'image')) - return lines - - async def ocr_from_images_in_lines(self, lines): - # 获取图像相邻文本 - last_para_pre = "" - for i in range(len(lines)): - line = lines[i] - if line['type'] == 'image': - lines[i]['related_text'] = last_para_pre - elif line['type'] == 'para': - last_para_pre = line['text'] - elif line['type'] == 'table': - pass - last_para_bac = "" - for i in range(len(lines) - 1, -1, -1): - line = lines[i] - if line['type'] == 'image': - lines[i]['related_text'] += last_para_bac - elif line['type'] == 'para': - last_para_bac = line['text'] - elif line['type'] == 'table': - pass - for line in lines: - if line['type'] == 'image': - line['text'] = await self.ocr_tool.image_to_text(line['image'], text=line['related_text']) + lines.append( + { + 'image': Image.open(BytesIO(image_blob)), + 'extension': extension, + 'type': 'image' + } + ) return lines - async def change_lines(self, lines): - """ - 修整处理lines,根据不同的类型(图像、段落、表格)处理每一行,并根据method参数决定处理方式。 - - 参数: - - lines (list): 需要处理的行列表,每行包含内容和类型。 - - method (str): 处理方法,可能是"ocr"、"llm-Enhance"或其他。 - - 返回: - - tuple: 包含处理后的句子列表和图像列表的元组。 - """ - new_lines = [] - images = [] - last_para_id = None - for line in lines: - if line[1] == 'image': - # 处理图像 - image_tuple = line[0] - image_id = self.get_uuid() - image = image_tuple[0] - image_bytes = image.tobytes() - image_extension = image_tuple[1] - await self.insert_image_to_tmp_folder(image_bytes, image_id, image_extension) - if self.parser_method in ['ocr', 'enhanced']: - # 将图片关联到图片的描述chunk上 - chunk_id = self.get_uuid() - new_lines.append({'id': chunk_id, - 'type': 'image'}) - new_lines[-1]['image'] = np.array(image) - images.append({ - 'id': image_id, - 'chunk_id': chunk_id, - 'extension': image_extension, - }) - else: - # 将图片关联到上一个段落chunk上 - images.append({ - 'id': image_id, - 'chunk_id': last_para_id, - 'extension': image_extension, - }) - - elif line[1] == 'para': - # 处理段落 - new_lines.append({'id': self.get_uuid(), - 'text': line[0], - 'type': line[1]}) - last_para_id = new_lines[-1]['id'] - - elif line[1] == 'table': - # 处理表格 - new_lines.append({'id': self.get_uuid(), - 'text': line[0], - 'type': line[1]}) - - if self.parser_method in ['ocr', 'enhanced']: - new_lines = await self.ocr_from_images_in_lines(new_lines) - return new_lines, images - async def parser(self, file_path): """ 解析文件并提取其中的文本和图像信息。 @@ -213,7 +159,7 @@ class DocxService(BaseService): lines = self.get_lines(doc) lines, images = await self.change_lines(lines) - + lines = await self.ocr_from_images_in_lines(lines) chunks = self.build_chunks_by_lines(lines) chunk_links = self.build_chunk_links_by_line(chunks) return chunks, chunk_links, images diff --git a/data_chain/parser/handler/pptx_parser.py b/data_chain/parser/handler/pptx_parser.py new file mode 100644 index 00000000..24164f72 --- /dev/null +++ b/data_chain/parser/handler/pptx_parser.py @@ -0,0 +1,84 @@ + +from pptx import Presentation +import os +from io import BytesIO +from PIL import Image +import numpy as np +from data_chain.parser.handler.base_parser import BaseService +from data_chain.parser.tools.ocr import BaseOCR +from data_chain.logger.logger import logger as logging + + +class PptxService(BaseService): + def __init__(self): + super().__init__() + + async def extract_ppt_content(self, pptx): + lines = [] + + for slide_num, slide in enumerate(pptx.slides, start=1): + for shape in slide.shapes: + # 提取文字 + if shape.has_text_frame: + text = "" + try: + 
for paragraph in shape.text_frame.paragraphs: + for run in paragraph.runs: + text += run.text + except Exception as e: + logging.error(f"Get text from slide failed due to: {e}") + if text.strip(): + lines.append({ + "text": text, + "type": 'text' + }) + # 提取表格 + elif shape.has_table: + table = shape.table + rows = self.split_table(table) + for row in rows: + lines.append({ + "text": text, + "type": table + }) + # 提取图片 + elif shape.shape_type == 13: # 13 表示图片类型 + try: + image = shape.image + image_ext = os.path.splitext(image.filename)[1] + except Exception as e: + logging.error(f"Extracting image from slide failed due to: {e}") + continue + lines.append({ + "image": Image.open(BytesIO(image.blob)), + "type": table, + "extension": image_ext + }) + + return lines + + async def parser(self, file_path): + """ + 解析文件并提取其中的文本和图像信息。 + + 参数: + - file_path (str): 文件的路径。 + + 返回: + - tuple: 包含分块的文本信息、分块间的链接信息和提取的图像信息的元组。 + 如果文件无法打开或解析失败,则返回 None。 + """ + try: + pptx = Presentation(file_path) + except Exception as e: + print(f"Pptx open failed due to: {e}") + raise e + if self.parser_method != "general": + self.ocr_tool = BaseOCR(llm=self.llm, method=self.parser_method) + lines = await self.extract_ppt_content(pptx) + + lines, images = await self.change_lines(lines) + lines = await self.ocr_from_images_in_lines(lines) + chunks = self.build_chunks_by_lines(lines) + chunk_links = self.build_chunk_links_by_line(chunks) + return chunks, chunk_links, images diff --git a/data_chain/parser/service/parser_service.py b/data_chain/parser/service/parser_service.py index cedac151..bd79fe2e 100644 --- a/data_chain/parser/service/parser_service.py +++ b/data_chain/parser/service/parser_service.py @@ -13,6 +13,7 @@ from data_chain.parser.handler.txt_parser import TxtService from data_chain.parser.handler.pdf_parser import PdfService from data_chain.parser.handler.md_parser import MdService from data_chain.parser.handler.doc_parser import DocService +from data_chain.parser.handler.pptx_parser import PptxService from data_chain.stores.postgres.postgres import ChunkEntity, TemporaryChunkEntity, ChunkLinkEntity, PostgresDB, ImageEntity, TemporaryVectorItemstEntity from data_chain.manager.document_manager import DocumentManager, TemporaryDocumentManager from data_chain.manager.chunk_manager import ChunkManager, ChunkLinkManager, TemporaryChunkManager @@ -36,6 +37,7 @@ class ParserService: ".xlsx": XlsxService, ".md": MdService, ".html": HtmlService, + ".pptx": PptxService, } if not is_temporary_document: self.doc = await DocumentManager.select_by_id(doc_id) @@ -80,7 +82,7 @@ class ParserService: @staticmethod async def update_full_text_to_pg(document_id, full_text, is_temporary_document=False): try: - update_dict={'full_text': full_text} + update_dict = {'full_text': full_text} if not is_temporary_document: await DocumentManager.update(document_id, update_dict) else: diff --git a/data_chain/parser/tools/ocr.py b/data_chain/parser/tools/ocr.py index baabfe58..5f3ceed9 100644 --- a/data_chain/parser/tools/ocr.py +++ b/data_chain/parser/tools/ocr.py @@ -44,7 +44,7 @@ class BaseOCR: logging.error(f"Ocr from image failed due to: {e}") return None - def merge_text_from_ocr_result(ocr_result): + def merge_text_from_ocr_result(self, ocr_result): """ ocr结果文字内容合并接口 参数: @@ -59,7 +59,7 @@ class BaseOCR: logging.error(f'Get text from ocr result failed due to: {e}') return '' - def cut_ocr_result_in_part(ocr_result, max_tokens=1024): + def cut_ocr_result_in_part(self, ocr_result, max_tokens=1024): """ ocr结果切割接口 参数: diff --git 
a/data_chain/parser/tools/split.py b/data_chain/parser/tools/split.py index 2f6010da..0a8bbb53 100644 --- a/data_chain/parser/tools/split.py +++ b/data_chain/parser/tools/split.py @@ -12,8 +12,8 @@ class SplitTools: except Exception as e: logging.error(f"Get tokens failed due to: {e}") return 0 - - def split_words(text): + + def split_words(self, text): return list(jieba.cut(str(text))) diff --git a/requirements.txt b/requirements.txt index 4897c547..c7a0ecef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,4 +36,5 @@ asyncpg==0.29.0 psycopg2-binary==2.9.9 openpyxl==3.1.2 beautifulsoup4==4.12.3 -tiktoken==0.8.0 \ No newline at end of file +tiktoken==0.8.0 +python-pptx==0.6.23 \ No newline at end of file diff --git a/utils/parser/handler/docx_parser.py b/utils/parser/handler/docx_parser.py index b1be756a..28c0039d 100644 --- a/utils/parser/handler/docx_parser.py +++ b/utils/parser/handler/docx_parser.py @@ -25,7 +25,7 @@ class DocxService(BaseService): doc = docx.Document(file_path) return doc except Exception as e: - logging.error(f"Error opening file {file_path} :{e}") + logging.error(f"Opening docx file {file_path} failed due to:{e}") raise e def is_image(self, graph: Paragraph, doc: Document): @@ -38,7 +38,7 @@ class DocxService(BaseService): return False # 获取run中的所有图片 - def get_imageparts_from_run(self, run, doc: Document): + def get_image_parts_from_run(self, run, doc: Document): image_parts = [] drawings = run._r.xpath('.//w:drawing') # 获取所有图片 for drawing in drawings: @@ -68,7 +68,7 @@ class DocxService(BaseService): while run_index < len(runs): run = runs[run_index] - image_parts = self.get_imageparts_from_run(run, parent) + image_parts = self.get_image_parts_from_run(run, parent) if image_parts: if text_part: lines.append((text_part, 'para')) -- Gitee From 3e48afe141080cea16c42c10d55f7d91006c0042 Mon Sep 17 00:00:00 2001 From: zxstty Date: Wed, 5 Mar 2025 11:49:21 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E4=B8=B4=E6=97=B6=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=A2=9E=E5=8A=A0pptx=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/apps/router/document.py | 37 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/data_chain/apps/router/document.py b/data_chain/apps/router/document.py index 9a947f63..5d0e86f6 100644 --- a/data_chain/apps/router/document.py +++ b/data_chain/apps/router/document.py @@ -2,7 +2,7 @@ import urllib from typing import Dict, List import uuid -from fastapi import HTTPException,status +from fastapi import HTTPException, status from data_chain.models.service import DocumentDTO, TemporaryDocumentDTO from data_chain.apps.service.user_service import verify_csrf_token, get_user_id, verify_user from data_chain.exceptions.err_code import ErrorCode @@ -156,31 +156,34 @@ async def download(id: uuid.UUID, user_id=Depends(get_user_id)): @router.post('/temporary/related', response_model=BaseResponse[List[uuid.UUID]]) async def related_temporary_doc(req: RelatedTemporaryDocumenRequest): try: - results = await get_related_document(req.content,req.top_k, req.document_ids, req.kb_sn) + results = await get_related_document(req.content, req.top_k, req.document_ids, req.kb_sn) return BaseResponse(data=results) except Exception as e: return BaseResponse(retcode=status.HTTP_500_INTERNAL_SERVER_ERROR, retmsg=str(e), data=None) + @router.post('/temporary/parser', response_model=BaseResponse[List[uuid.UUID]]) async def 
parser_temporary_doc(req: ParserTemporaryDocumenRequest): try: temporary_document_list = [] for i in range(len(req.document_list)): - tmp_dict=dict(req.document_list[i]) - if tmp_dict['type']=='application/pdf': - tmp_dict['type']='.pdf' - elif tmp_dict['type']=='text/html': - tmp_dict['type']='.html' - elif tmp_dict['type']=='text/plain': - tmp_dict['type']='.txt' - elif tmp_dict['type']=='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - tmp_dict['type']='.xlsx' - elif tmp_dict['type']=='text/x-markdown': - tmp_dict['type']='.md' - elif tmp_dict['type']=='application/vnd.openxmlformats-officedocument.wordprocessingml.document': - tmp_dict['type']='.docx' - elif tmp_dict['type']=='application/msword': - tmp_dict['type']='.doc' + tmp_dict = dict(req.document_list[i]) + if tmp_dict['type'] == 'application/pdf': + tmp_dict['type'] = '.pdf' + elif tmp_dict['type'] == 'text/html': + tmp_dict['type'] = '.html' + elif tmp_dict['type'] == 'text/plain': + tmp_dict['type'] = '.txt' + elif tmp_dict['type'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': + tmp_dict['type'] = '.xlsx' + elif tmp_dict['type'] == 'text/x-markdown': + tmp_dict['type'] = '.md' + elif tmp_dict['type'] == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + tmp_dict['type'] = '.docx' + elif tmp_dict['type'] == 'application/msword': + tmp_dict['type'] = '.doc' + elif tmp_dict['type'] == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': + tmp_dict['type'] = '.pptx' temporary_document_list.append(tmp_dict) result = await init_temporary_document_parse_task(temporary_document_list) return BaseResponse(data=result) -- Gitee From ecd5ae958fa7cf95353099828129fb089778bc80 Mon Sep 17 00:00:00 2001 From: zxstty Date: Wed, 5 Mar 2025 11:54:26 +0800 Subject: [PATCH 3/3] =?UTF-8?q?base=5Fparser=E7=9A=84tokens=E6=94=B9?= =?UTF-8?q?=E4=B8=BAchunk=5Ftokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/parser/handler/base_parser.py | 12 ++++++------ data_chain/parser/handler/doc_parser.py | 2 +- data_chain/parser/handler/md_parser.py | 2 +- data_chain/parser/handler/txt_parser.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/data_chain/parser/handler/base_parser.py b/data_chain/parser/handler/base_parser.py index d465f525..1b6436cd 100644 --- a/data_chain/parser/handler/base_parser.py +++ b/data_chain/parser/handler/base_parser.py @@ -28,10 +28,10 @@ class BaseService: self.vectorizer = None self.llm_max_tokens = None self.llm = None - self.tokens = None + self.chunk_tokens = None self.ocr_tool = None - async def init_service(self, llm_entity, tokens, parser_method): + async def init_service(self, llm_entity, chunk_tokens, parser_method): self.parser_method = parser_method if llm_entity is not None: self.llm = LLM( @@ -44,7 +44,7 @@ class BaseService: max_tokens=llm_entity.max_tokens, ) self.llm_max_tokens = llm_entity.max_tokens - self.tokens = tokens + self.chunk_tokens = chunk_tokens self.vectorizer = TfidfVectorizer() @staticmethod @@ -82,8 +82,8 @@ class BaseService: if text['text'] == "": continue token_len = split_tools.get_tokens(text) - if now_len + token_len < max(self.tokens // 2, 128) or ( - now_len + token_len < self.tokens and self.check_similarity(now_text, text['text'])): + if now_len + token_len < max(self.chunk_tokens // 2, 128) or ( + now_len + token_len < self.chunk_tokens and self.check_similarity(now_text, text['text'])): now_text += 
text['text'] + '\n' now_len += token_len else: @@ -152,7 +152,7 @@ class BaseService: logging.error(f"split tables error as{e}") return [] - max_tokens = (self.tokens - cell_num) // cell_num + max_tokens = (self.chunk_tokens - cell_num) // cell_num for row in new_table: new_line = [] max_len = 0 diff --git a/data_chain/parser/handler/doc_parser.py b/data_chain/parser/handler/doc_parser.py index fdb1de61..9e89b9b6 100644 --- a/data_chain/parser/handler/doc_parser.py +++ b/data_chain/parser/handler/doc_parser.py @@ -7,7 +7,7 @@ from data_chain.parser.handler.base_parser import BaseService class DocService(BaseService): def extract_paragraph(self, paragraph): - sentences = self.split_sentences(paragraph, self.tokens) + sentences = self.split_sentences(paragraph, self.chunk_tokens) results = [] for sentence in sentences: results.append({ diff --git a/data_chain/parser/handler/md_parser.py b/data_chain/parser/handler/md_parser.py index be0e6873..0018df0c 100644 --- a/data_chain/parser/handler/md_parser.py +++ b/data_chain/parser/handler/md_parser.py @@ -26,7 +26,7 @@ class MdService(BaseService): else: type = "para" lines = lines[0] - lines = self.split_sentences(lines, self.tokens) + lines = self.split_sentences(lines, self.chunk_tokens) for line in lines: results.append({ 'type': type, diff --git a/data_chain/parser/handler/txt_parser.py b/data_chain/parser/handler/txt_parser.py index 15a53357..5ba62068 100644 --- a/data_chain/parser/handler/txt_parser.py +++ b/data_chain/parser/handler/txt_parser.py @@ -10,7 +10,7 @@ class TxtService(BaseService): # 提取段落分词结果 def extract_paragraph(self, paragraph): - sentences = self.split_sentences(paragraph, self.tokens) + sentences = self.split_sentences(paragraph, self.chunk_tokens) results = [] for sentence in sentences: results.append({ -- Gitee
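Note on the new pptx handler: the slide walk in extract_ppt_content branches on text frames, tables and pictures, but its table and picture branches append the accumulated text and the table object as the 'type' tag, where per-row text and the string tags 'table' / 'image' look like the intent. A minimal standalone sketch of that walk with python-pptx, for reference only (flatten_row and extract_pptx_lines are illustrative names, not part of the patch):

    from io import BytesIO

    from PIL import Image
    from pptx import Presentation
    from pptx.enum.shapes import MSO_SHAPE_TYPE  # MSO_SHAPE_TYPE.PICTURE == 13, the magic number used above


    def flatten_row(row):
        # Escape '|' inside cells the way base_parser.split_table does, then join with '|'.
        return '|'.join(cell.text.strip().replace('|', '||') for cell in row.cells)


    def extract_pptx_lines(path):
        lines = []
        for slide in Presentation(path).slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    text = ''.join(run.text
                                   for para in shape.text_frame.paragraphs
                                   for run in para.runs)
                    if text.strip():
                        lines.append({'text': text, 'type': 'text'})
                elif shape.has_table:
                    for row in shape.table.rows:
                        lines.append({'text': flatten_row(row), 'type': 'table'})
                elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    image = shape.image  # may raise for linked/unsupported pictures, mirroring the try/except above
                    lines.append({'image': Image.open(BytesIO(image.blob)),
                                  'extension': image.ext,  # canonical extension without the dot, no filename needed
                                  'type': 'image'})
        return lines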
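The content-type branch added to parser_temporary_doc grows by one elif per format; the same mapping reads more easily as a lookup table and keeps future additions (such as the pptx entry above) to one line. A sketch using only the MIME types the handler already lists (MIME_TO_EXTENSION is an illustrative name):

    # Maps the reported MIME type to the suffix used to select a parser handler.
    MIME_TO_EXTENSION = {
        'application/pdf': '.pdf',
        'text/html': '.html',
        'text/plain': '.txt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
        'text/x-markdown': '.md',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'application/msword': '.doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
    }

    # Unknown types fall through unchanged, matching the behaviour of the elif chain.
    tmp_dict['type'] = MIME_TO_EXTENSION.get(tmp_dict['type'], tmp_dict['type'])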
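On the chunk_tokens rename in the last patch: the merge loop it feeds lets a sentence always join the current chunk while the chunk stays under max(chunk_tokens // 2, 128) tokens, and lets it keep growing up to the full chunk_tokens budget only while it remains similar to the text already accumulated. A reduced sketch of that rule; group_sentences, token_len_of and the 0.3 threshold are illustrative, and the TF-IDF cosine check merely stands in for check_similarity, whose body is not part of this diff:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity


    def similar(vectorizer, a, b, threshold=0.3):
        # Assumed stand-in for BaseService.check_similarity: TF-IDF cosine similarity.
        matrix = vectorizer.fit_transform([a, b])
        return cosine_similarity(matrix[0], matrix[1])[0][0] >= threshold


    def group_sentences(sentences, token_len_of, chunk_tokens):
        # Greedily merge sentences into chunks of at most chunk_tokens tokens.
        vectorizer = TfidfVectorizer()
        chunks, now_text, now_len = [], '', 0
        for text in sentences:
            token_len = token_len_of(text)
            fits_half = now_len + token_len < max(chunk_tokens // 2, 128)
            fits_full = now_len + token_len < chunk_tokens
            if now_text and not (fits_half or (fits_full and similar(vectorizer, now_text, text))):
                chunks.append(now_text)
                now_text, now_len = '', 0
            now_text += text + '\n'
            now_len += token_len
        if now_text:
            chunks.append(now_text)
        return chunks

For example, group_sentences(sentences, split_tools.get_tokens, chunk_tokens=1024) mirrors how the loop in base_parser consumes self.chunk_tokens.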