From 991ccf8f462776540d17e5b184efe0cce8731824 Mon Sep 17 00:00:00 2001 From: zxstty Date: Wed, 5 Mar 2025 11:47:38 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E5=A2=9E=E5=8A=A0pptx=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/parser/handler/base_parser.py | 92 ++++++++++++- data_chain/parser/handler/docx_parser.py | 138 ++++++-------------- data_chain/parser/handler/pptx_parser.py | 84 ++++++++++++ data_chain/parser/service/parser_service.py | 4 +- data_chain/parser/tools/ocr.py | 4 +- data_chain/parser/tools/split.py | 4 +- requirements.txt | 3 +- utils/parser/handler/docx_parser.py | 6 +- 8 files changed, 224 insertions(+), 111 deletions(-) create mode 100644 data_chain/parser/handler/pptx_parser.py diff --git a/data_chain/parser/handler/base_parser.py b/data_chain/parser/handler/base_parser.py index 6de4b39b..d465f525 100644 --- a/data_chain/parser/handler/base_parser.py +++ b/data_chain/parser/handler/base_parser.py @@ -1,13 +1,14 @@ import os import uuid import json + +import pptx.table from data_chain.logger.logger import logger as logging from pandas import DataFrame from docx.table import Table as DocxTable +import pptx from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity -import secrets -import shutil from data_chain.manager.document_manager import DocumentManager from data_chain.manager.model_manager import ModelManager from data_chain.parser.tools.split import split_tools @@ -28,6 +29,7 @@ class BaseService: self.llm_max_tokens = None self.llm = None self.tokens = None + self.ocr_tool = None async def init_service(self, llm_entity, tokens, parser_method): self.parser_method = parser_method @@ -38,9 +40,9 @@ class BaseService: openai_api_key=Security.decrypt( llm_entity.encrypted_openai_api_key, json.loads(llm_entity.encrypted_config) - ), - max_tokens=llm_entity.max_tokens, - ) + ), + max_tokens=llm_entity.max_tokens, + ) self.llm_max_tokens = llm_entity.max_tokens self.tokens = tokens self.vectorizer = TfidfVectorizer() @@ -137,7 +139,7 @@ class BaseService: row_string_list = [s.replace('|', '||') for s in row.astype(str).tolist()] cell_num = max(cell_num, len(row_string_list)) new_table.append(row_string_list) - elif isinstance(table, DocxTable): + elif isinstance(table, DocxTable) or isinstance(table, pptx.table.Table): if table.rows: for row in table.rows: row_string_list = [s.replace('|', '||') for s in (cell.text.strip() for cell in row.cells)] @@ -167,6 +169,84 @@ class BaseService: return result + async def ocr_from_images_in_lines(self, lines): + # 获取图像相邻文本 + last_para_pre = "" + for i in range(len(lines)): + line = lines[i] + if line['type'] == 'image': + lines[i]['related_text'] = last_para_pre + elif line['type'] == 'para': + last_para_pre = line['text'] + elif line['type'] == 'table': + pass + last_para_bac = "" + for i in range(len(lines) - 1, -1, -1): + line = lines[i] + if line['type'] == 'image': + lines[i]['related_text'] += last_para_bac + elif line['type'] == 'para': + last_para_bac = line['text'] + elif line['type'] == 'table': + pass + for line in lines: + if line['type'] == 'image': + line['text'] = await self.ocr_tool.image_to_text(line['image'], text=line['related_text']) + return lines + + async def change_lines(self, lines): + """ + 修整处理lines,根据不同的类型(图像、段落、表格)处理每一行,并根据method参数决定处理方式。 + + 参数: + - lines (list): 需要处理的行列表,每行包含内容和类型。 + 返回: + - tuple: 包含处理后的句子列表和图像列表的元组。 + """ + new_lines = [] 
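+        # Images are written to the tmp folder and then linked either to a dedicated
+        # description chunk (ocr/enhanced parsing) or to the preceding paragraph chunk;
+        # paragraph and table lines are re-emitted with fresh chunk ids.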
+ images = [] + last_para_id = None + for line in lines: + if line['type'] == 'image': + # 处理图像 + image_id = self.get_uuid() + image = line['image'] + image_bytes = image.tobytes() + image_extension = line['extension'] + await self.insert_image_to_tmp_folder(image_bytes, image_id, image_extension) + if self.parser_method in ['ocr', 'enhanced']: + # 将图片关联到图片的描述chunk上 + chunk_id = self.get_uuid() + new_lines.append({'id': chunk_id, + 'type': 'image'}) + new_lines[-1]['image'] = np.array(image) + images.append({ + 'id': image_id, + 'chunk_id': chunk_id, + 'extension': image_extension, + }) + else: + # 将图片关联到上一个段落chunk上 + images.append({ + 'id': image_id, + 'chunk_id': last_para_id, + 'extension': image_extension, + }) + + elif line['type'] == 'para': + # 处理段落 + new_lines.append({'id': self.get_uuid(), + 'text': line['text'], + 'type': line['type']}) + last_para_id = new_lines[-1]['id'] + + elif line[1] == 'table': + # 处理表格 + new_lines.append({'id': self.get_uuid(), + 'text': line['text'], + 'type': line['type']}) + return new_lines, images + def package_to_chunk(self, **kwargs): """ 整合成chunk diff --git a/data_chain/parser/handler/docx_parser.py b/data_chain/parser/handler/docx_parser.py index 120b9931..74ebcbaf 100644 --- a/data_chain/parser/handler/docx_parser.py +++ b/data_chain/parser/handler/docx_parser.py @@ -1,10 +1,7 @@ -from data_chain.logger.logger import logger as logging import docx -import uuid from io import BytesIO from PIL import Image import numpy as np -import docx from docx.document import Document from docx.text.paragraph import Paragraph from docx.parts.image import ImagePart @@ -15,12 +12,12 @@ from docx.oxml.shape import CT_Picture import mimetypes from data_chain.parser.handler.base_parser import BaseService from data_chain.parser.tools.ocr import BaseOCR +from data_chain.logger.logger import logger as logging class DocxService(BaseService): def __init__(self): super().__init__() - self.ocr_tool = None def open_file(self, file_path): try: @@ -73,29 +70,56 @@ class DocxService(BaseService): image_parts = self.get_imageparts_from_run(run, parent) if image_parts: if text_part: - lines.append((text_part, 'para')) + lines.append( + { + 'text': text_part, + 'type': 'text' + } + ) text_part = '' for image_part in image_parts: try: image_blob = image_part.image.blob content_type = image_part.content_type except Exception as e: - logging.error(f"Image blob and part get failed due to :{e}") + logging.error(f"Get Image blob and part failed due to :{e}") + continue extension = mimetypes.guess_extension(content_type).replace('.', '') - lines.append(((Image.open(BytesIO(image_blob)), extension), 'image')) + lines.append( + { + 'image': Image.open(BytesIO(image_blob)), + 'extension': extension, + 'type': 'image' + } + ) else: text_part += run.text run_index += 1 if text_part: - lines.append((text_part, 'para')) + lines.append( + { + 'text': text_part, + 'type': 'text' + } + ) else: - lines.append((paragraph.text, 'para')) + lines.append( + { + 'text': paragraph.text, + 'type': 'text' + } + ) elif isinstance(child, CT_Tbl): table = Table(child, parent) rows = self.split_table(table) for row in rows: - lines.append((row, 'table')) + lines.append( + { + 'text': row, + 'type': 'table' + } + ) elif isinstance(child, CT_Picture): img_id = child.xpath('.//a:blip/@r:embed')[0] part = parent.part.related_parts[img_id] @@ -107,93 +131,15 @@ class DocxService(BaseService): logging.error(f'Get image blob and content type failed due to: {e}') continue extension = 
mimetypes.guess_extension(content_type).replace('.', '') - lines.append(((Image.open(BytesIO(image_blob)), extension), 'image')) - return lines - - async def ocr_from_images_in_lines(self, lines): - # 获取图像相邻文本 - last_para_pre = "" - for i in range(len(lines)): - line = lines[i] - if line['type'] == 'image': - lines[i]['related_text'] = last_para_pre - elif line['type'] == 'para': - last_para_pre = line['text'] - elif line['type'] == 'table': - pass - last_para_bac = "" - for i in range(len(lines) - 1, -1, -1): - line = lines[i] - if line['type'] == 'image': - lines[i]['related_text'] += last_para_bac - elif line['type'] == 'para': - last_para_bac = line['text'] - elif line['type'] == 'table': - pass - for line in lines: - if line['type'] == 'image': - line['text'] = await self.ocr_tool.image_to_text(line['image'], text=line['related_text']) + lines.append( + { + 'image': Image.open(BytesIO(image_blob)), + 'extension': extension, + 'type': 'image' + } + ) return lines - async def change_lines(self, lines): - """ - 修整处理lines,根据不同的类型(图像、段落、表格)处理每一行,并根据method参数决定处理方式。 - - 参数: - - lines (list): 需要处理的行列表,每行包含内容和类型。 - - method (str): 处理方法,可能是"ocr"、"llm-Enhance"或其他。 - - 返回: - - tuple: 包含处理后的句子列表和图像列表的元组。 - """ - new_lines = [] - images = [] - last_para_id = None - for line in lines: - if line[1] == 'image': - # 处理图像 - image_tuple = line[0] - image_id = self.get_uuid() - image = image_tuple[0] - image_bytes = image.tobytes() - image_extension = image_tuple[1] - await self.insert_image_to_tmp_folder(image_bytes, image_id, image_extension) - if self.parser_method in ['ocr', 'enhanced']: - # 将图片关联到图片的描述chunk上 - chunk_id = self.get_uuid() - new_lines.append({'id': chunk_id, - 'type': 'image'}) - new_lines[-1]['image'] = np.array(image) - images.append({ - 'id': image_id, - 'chunk_id': chunk_id, - 'extension': image_extension, - }) - else: - # 将图片关联到上一个段落chunk上 - images.append({ - 'id': image_id, - 'chunk_id': last_para_id, - 'extension': image_extension, - }) - - elif line[1] == 'para': - # 处理段落 - new_lines.append({'id': self.get_uuid(), - 'text': line[0], - 'type': line[1]}) - last_para_id = new_lines[-1]['id'] - - elif line[1] == 'table': - # 处理表格 - new_lines.append({'id': self.get_uuid(), - 'text': line[0], - 'type': line[1]}) - - if self.parser_method in ['ocr', 'enhanced']: - new_lines = await self.ocr_from_images_in_lines(new_lines) - return new_lines, images - async def parser(self, file_path): """ 解析文件并提取其中的文本和图像信息。 @@ -213,7 +159,7 @@ class DocxService(BaseService): lines = self.get_lines(doc) lines, images = await self.change_lines(lines) - + lines = await self.ocr_from_images_in_lines(lines) chunks = self.build_chunks_by_lines(lines) chunk_links = self.build_chunk_links_by_line(chunks) return chunks, chunk_links, images diff --git a/data_chain/parser/handler/pptx_parser.py b/data_chain/parser/handler/pptx_parser.py new file mode 100644 index 00000000..24164f72 --- /dev/null +++ b/data_chain/parser/handler/pptx_parser.py @@ -0,0 +1,84 @@ + +from pptx import Presentation +import os +from io import BytesIO +from PIL import Image +import numpy as np +from data_chain.parser.handler.base_parser import BaseService +from data_chain.parser.tools.ocr import BaseOCR +from data_chain.logger.logger import logger as logging + + +class PptxService(BaseService): + def __init__(self): + super().__init__() + + async def extract_ppt_content(self, pptx): + lines = [] + + for slide_num, slide in enumerate(pptx.slides, start=1): + for shape in slide.shapes: + # 提取文字 + if shape.has_text_frame: + text = "" + try: + 
for paragraph in shape.text_frame.paragraphs: + for run in paragraph.runs: + text += run.text + except Exception as e: + logging.error(f"Get text from slide failed due to: {e}") + if text.strip(): + lines.append({ + "text": text, + "type": 'text' + }) + # 提取表格 + elif shape.has_table: + table = shape.table + rows = self.split_table(table) + for row in rows: + lines.append({ + "text": text, + "type": table + }) + # 提取图片 + elif shape.shape_type == 13: # 13 表示图片类型 + try: + image = shape.image + image_ext = os.path.splitext(image.filename)[1] + except Exception as e: + logging.error(f"Extracting image from slide failed due to: {e}") + continue + lines.append({ + "image": Image.open(BytesIO(image.blob)), + "type": table, + "extension": image_ext + }) + + return lines + + async def parser(self, file_path): + """ + 解析文件并提取其中的文本和图像信息。 + + 参数: + - file_path (str): 文件的路径。 + + 返回: + - tuple: 包含分块的文本信息、分块间的链接信息和提取的图像信息的元组。 + 如果文件无法打开或解析失败,则返回 None。 + """ + try: + pptx = Presentation(file_path) + except Exception as e: + print(f"Pptx open failed due to: {e}") + raise e + if self.parser_method != "general": + self.ocr_tool = BaseOCR(llm=self.llm, method=self.parser_method) + lines = await self.extract_ppt_content(pptx) + + lines, images = await self.change_lines(lines) + lines = await self.ocr_from_images_in_lines(lines) + chunks = self.build_chunks_by_lines(lines) + chunk_links = self.build_chunk_links_by_line(chunks) + return chunks, chunk_links, images diff --git a/data_chain/parser/service/parser_service.py b/data_chain/parser/service/parser_service.py index cedac151..bd79fe2e 100644 --- a/data_chain/parser/service/parser_service.py +++ b/data_chain/parser/service/parser_service.py @@ -13,6 +13,7 @@ from data_chain.parser.handler.txt_parser import TxtService from data_chain.parser.handler.pdf_parser import PdfService from data_chain.parser.handler.md_parser import MdService from data_chain.parser.handler.doc_parser import DocService +from data_chain.parser.handler.pptx_parser import PptxService from data_chain.stores.postgres.postgres import ChunkEntity, TemporaryChunkEntity, ChunkLinkEntity, PostgresDB, ImageEntity, TemporaryVectorItemstEntity from data_chain.manager.document_manager import DocumentManager, TemporaryDocumentManager from data_chain.manager.chunk_manager import ChunkManager, ChunkLinkManager, TemporaryChunkManager @@ -36,6 +37,7 @@ class ParserService: ".xlsx": XlsxService, ".md": MdService, ".html": HtmlService, + ".pptx": PptxService, } if not is_temporary_document: self.doc = await DocumentManager.select_by_id(doc_id) @@ -80,7 +82,7 @@ class ParserService: @staticmethod async def update_full_text_to_pg(document_id, full_text, is_temporary_document=False): try: - update_dict={'full_text': full_text} + update_dict = {'full_text': full_text} if not is_temporary_document: await DocumentManager.update(document_id, update_dict) else: diff --git a/data_chain/parser/tools/ocr.py b/data_chain/parser/tools/ocr.py index baabfe58..5f3ceed9 100644 --- a/data_chain/parser/tools/ocr.py +++ b/data_chain/parser/tools/ocr.py @@ -44,7 +44,7 @@ class BaseOCR: logging.error(f"Ocr from image failed due to: {e}") return None - def merge_text_from_ocr_result(ocr_result): + def merge_text_from_ocr_result(self, ocr_result): """ ocr结果文字内容合并接口 参数: @@ -59,7 +59,7 @@ class BaseOCR: logging.error(f'Get text from ocr result failed due to: {e}') return '' - def cut_ocr_result_in_part(ocr_result, max_tokens=1024): + def cut_ocr_result_in_part(self, ocr_result, max_tokens=1024): """ ocr结果切割接口 参数: diff --git 
a/data_chain/parser/tools/split.py b/data_chain/parser/tools/split.py index 2f6010da..0a8bbb53 100644 --- a/data_chain/parser/tools/split.py +++ b/data_chain/parser/tools/split.py @@ -12,8 +12,8 @@ class SplitTools: except Exception as e: logging.error(f"Get tokens failed due to: {e}") return 0 - - def split_words(text): + + def split_words(self, text): return list(jieba.cut(str(text))) diff --git a/requirements.txt b/requirements.txt index 4897c547..c7a0ecef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,4 +36,5 @@ asyncpg==0.29.0 psycopg2-binary==2.9.9 openpyxl==3.1.2 beautifulsoup4==4.12.3 -tiktoken==0.8.0 \ No newline at end of file +tiktoken==0.8.0 +python-pptx==0.6.23 \ No newline at end of file diff --git a/utils/parser/handler/docx_parser.py b/utils/parser/handler/docx_parser.py index b1be756a..28c0039d 100644 --- a/utils/parser/handler/docx_parser.py +++ b/utils/parser/handler/docx_parser.py @@ -25,7 +25,7 @@ class DocxService(BaseService): doc = docx.Document(file_path) return doc except Exception as e: - logging.error(f"Error opening file {file_path} :{e}") + logging.error(f"Opening docx file {file_path} failed due to:{e}") raise e def is_image(self, graph: Paragraph, doc: Document): @@ -38,7 +38,7 @@ class DocxService(BaseService): return False # 获取run中的所有图片 - def get_imageparts_from_run(self, run, doc: Document): + def get_image_parts_from_run(self, run, doc: Document): image_parts = [] drawings = run._r.xpath('.//w:drawing') # 获取所有图片 for drawing in drawings: @@ -68,7 +68,7 @@ class DocxService(BaseService): while run_index < len(runs): run = runs[run_index] - image_parts = self.get_imageparts_from_run(run, parent) + image_parts = self.get_image_parts_from_run(run, parent) if image_parts: if text_part: lines.append((text_part, 'para')) -- Gitee From 3e48afe141080cea16c42c10d55f7d91006c0042 Mon Sep 17 00:00:00 2001 From: zxstty Date: Wed, 5 Mar 2025 11:49:21 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E4=B8=B4=E6=97=B6=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=A2=9E=E5=8A=A0pptx=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/apps/router/document.py | 37 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/data_chain/apps/router/document.py b/data_chain/apps/router/document.py index 9a947f63..5d0e86f6 100644 --- a/data_chain/apps/router/document.py +++ b/data_chain/apps/router/document.py @@ -2,7 +2,7 @@ import urllib from typing import Dict, List import uuid -from fastapi import HTTPException,status +from fastapi import HTTPException, status from data_chain.models.service import DocumentDTO, TemporaryDocumentDTO from data_chain.apps.service.user_service import verify_csrf_token, get_user_id, verify_user from data_chain.exceptions.err_code import ErrorCode @@ -156,31 +156,34 @@ async def download(id: uuid.UUID, user_id=Depends(get_user_id)): @router.post('/temporary/related', response_model=BaseResponse[List[uuid.UUID]]) async def related_temporary_doc(req: RelatedTemporaryDocumenRequest): try: - results = await get_related_document(req.content,req.top_k, req.document_ids, req.kb_sn) + results = await get_related_document(req.content, req.top_k, req.document_ids, req.kb_sn) return BaseResponse(data=results) except Exception as e: return BaseResponse(retcode=status.HTTP_500_INTERNAL_SERVER_ERROR, retmsg=str(e), data=None) + @router.post('/temporary/parser', response_model=BaseResponse[List[uuid.UUID]]) async def 
parser_temporary_doc(req: ParserTemporaryDocumenRequest): try: temporary_document_list = [] for i in range(len(req.document_list)): - tmp_dict=dict(req.document_list[i]) - if tmp_dict['type']=='application/pdf': - tmp_dict['type']='.pdf' - elif tmp_dict['type']=='text/html': - tmp_dict['type']='.html' - elif tmp_dict['type']=='text/plain': - tmp_dict['type']='.txt' - elif tmp_dict['type']=='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - tmp_dict['type']='.xlsx' - elif tmp_dict['type']=='text/x-markdown': - tmp_dict['type']='.md' - elif tmp_dict['type']=='application/vnd.openxmlformats-officedocument.wordprocessingml.document': - tmp_dict['type']='.docx' - elif tmp_dict['type']=='application/msword': - tmp_dict['type']='.doc' + tmp_dict = dict(req.document_list[i]) + if tmp_dict['type'] == 'application/pdf': + tmp_dict['type'] = '.pdf' + elif tmp_dict['type'] == 'text/html': + tmp_dict['type'] = '.html' + elif tmp_dict['type'] == 'text/plain': + tmp_dict['type'] = '.txt' + elif tmp_dict['type'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': + tmp_dict['type'] = '.xlsx' + elif tmp_dict['type'] == 'text/x-markdown': + tmp_dict['type'] = '.md' + elif tmp_dict['type'] == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + tmp_dict['type'] = '.docx' + elif tmp_dict['type'] == 'application/msword': + tmp_dict['type'] = '.doc' + elif tmp_dict['type'] == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': + tmp_dict['type'] = '.pptx' temporary_document_list.append(tmp_dict) result = await init_temporary_document_parse_task(temporary_document_list) return BaseResponse(data=result) -- Gitee From ecd5ae958fa7cf95353099828129fb089778bc80 Mon Sep 17 00:00:00 2001 From: zxstty Date: Wed, 5 Mar 2025 11:54:26 +0800 Subject: [PATCH 3/3] =?UTF-8?q?base=5Fparser=E7=9A=84tokens=E6=94=B9?= =?UTF-8?q?=E4=B8=BAchunk=5Ftokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_chain/parser/handler/base_parser.py | 12 ++++++------ data_chain/parser/handler/doc_parser.py | 2 +- data_chain/parser/handler/md_parser.py | 2 +- data_chain/parser/handler/txt_parser.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/data_chain/parser/handler/base_parser.py b/data_chain/parser/handler/base_parser.py index d465f525..1b6436cd 100644 --- a/data_chain/parser/handler/base_parser.py +++ b/data_chain/parser/handler/base_parser.py @@ -28,10 +28,10 @@ class BaseService: self.vectorizer = None self.llm_max_tokens = None self.llm = None - self.tokens = None + self.chunk_tokens = None self.ocr_tool = None - async def init_service(self, llm_entity, tokens, parser_method): + async def init_service(self, llm_entity, chunk_tokens, parser_method): self.parser_method = parser_method if llm_entity is not None: self.llm = LLM( @@ -44,7 +44,7 @@ class BaseService: max_tokens=llm_entity.max_tokens, ) self.llm_max_tokens = llm_entity.max_tokens - self.tokens = tokens + self.chunk_tokens = chunk_tokens self.vectorizer = TfidfVectorizer() @staticmethod @@ -82,8 +82,8 @@ class BaseService: if text['text'] == "": continue token_len = split_tools.get_tokens(text) - if now_len + token_len < max(self.tokens // 2, 128) or ( - now_len + token_len < self.tokens and self.check_similarity(now_text, text['text'])): + if now_len + token_len < max(self.chunk_tokens // 2, 128) or ( + now_len + token_len < self.chunk_tokens and self.check_similarity(now_text, text['text'])): now_text += 
text['text'] + '\n' now_len += token_len else: @@ -152,7 +152,7 @@ class BaseService: logging.error(f"split tables error as{e}") return [] - max_tokens = (self.tokens - cell_num) // cell_num + max_tokens = (self.chunk_tokens - cell_num) // cell_num for row in new_table: new_line = [] max_len = 0 diff --git a/data_chain/parser/handler/doc_parser.py b/data_chain/parser/handler/doc_parser.py index fdb1de61..9e89b9b6 100644 --- a/data_chain/parser/handler/doc_parser.py +++ b/data_chain/parser/handler/doc_parser.py @@ -7,7 +7,7 @@ from data_chain.parser.handler.base_parser import BaseService class DocService(BaseService): def extract_paragraph(self, paragraph): - sentences = self.split_sentences(paragraph, self.tokens) + sentences = self.split_sentences(paragraph, self.chunk_tokens) results = [] for sentence in sentences: results.append({ diff --git a/data_chain/parser/handler/md_parser.py b/data_chain/parser/handler/md_parser.py index be0e6873..0018df0c 100644 --- a/data_chain/parser/handler/md_parser.py +++ b/data_chain/parser/handler/md_parser.py @@ -26,7 +26,7 @@ class MdService(BaseService): else: type = "para" lines = lines[0] - lines = self.split_sentences(lines, self.tokens) + lines = self.split_sentences(lines, self.chunk_tokens) for line in lines: results.append({ 'type': type, diff --git a/data_chain/parser/handler/txt_parser.py b/data_chain/parser/handler/txt_parser.py index 15a53357..5ba62068 100644 --- a/data_chain/parser/handler/txt_parser.py +++ b/data_chain/parser/handler/txt_parser.py @@ -10,7 +10,7 @@ class TxtService(BaseService): # 提取段落分词结果 def extract_paragraph(self, paragraph): - sentences = self.split_sentences(paragraph, self.tokens) + sentences = self.split_sentences(paragraph, self.chunk_tokens) results = [] for sentence in sentences: results.append({ -- Gitee
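Note on the new pptx handler: the slide walk in extract_ppt_content branches on text frames, tables and pictures, but its table and picture branches append the accumulated text and the table object as the 'type' tag, where per-row text and the string tags 'table' / 'image' look like the intent. A minimal standalone sketch of that walk with python-pptx, for reference only (flatten_row and extract_pptx_lines are illustrative names, not part of the patch):

    from io import BytesIO

    from PIL import Image
    from pptx import Presentation
    from pptx.enum.shapes import MSO_SHAPE_TYPE  # MSO_SHAPE_TYPE.PICTURE == 13, the magic number used above


    def flatten_row(row):
        # Escape '|' inside cells the way base_parser.split_table does, then join with '|'.
        return '|'.join(cell.text.strip().replace('|', '||') for cell in row.cells)


    def extract_pptx_lines(path):
        lines = []
        for slide in Presentation(path).slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    text = ''.join(run.text
                                   for para in shape.text_frame.paragraphs
                                   for run in para.runs)
                    if text.strip():
                        lines.append({'text': text, 'type': 'text'})
                elif shape.has_table:
                    for row in shape.table.rows:
                        lines.append({'text': flatten_row(row), 'type': 'table'})
                elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    image = shape.image  # may raise for linked/unsupported pictures, mirroring the try/except above
                    lines.append({'image': Image.open(BytesIO(image.blob)),
                                  'extension': image.ext,  # canonical extension without the dot, no filename needed
                                  'type': 'image'})
        return lines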
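The content-type branch added to parser_temporary_doc grows by one elif per format; the same mapping reads more easily as a lookup table and keeps future additions (such as the pptx entry above) to one line. A sketch using only the MIME types the handler already lists (MIME_TO_EXTENSION is an illustrative name):

    # Maps the reported MIME type to the suffix used to select a parser handler.
    MIME_TO_EXTENSION = {
        'application/pdf': '.pdf',
        'text/html': '.html',
        'text/plain': '.txt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
        'text/x-markdown': '.md',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'application/msword': '.doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
    }

    # Unknown types fall through unchanged, matching the behaviour of the elif chain.
    tmp_dict['type'] = MIME_TO_EXTENSION.get(tmp_dict['type'], tmp_dict['type'])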
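On the chunk_tokens rename in the last patch: the merge loop it feeds lets a sentence always join the current chunk while the chunk stays under max(chunk_tokens // 2, 128) tokens, and lets it keep growing up to the full chunk_tokens budget only while it remains similar to the text already accumulated. A reduced sketch of that rule; group_sentences, token_len_of and the 0.3 threshold are illustrative, and the TF-IDF cosine check merely stands in for check_similarity, whose body is not part of this diff:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity


    def similar(vectorizer, a, b, threshold=0.3):
        # Assumed stand-in for BaseService.check_similarity: TF-IDF cosine similarity.
        matrix = vectorizer.fit_transform([a, b])
        return cosine_similarity(matrix[0], matrix[1])[0][0] >= threshold


    def group_sentences(sentences, token_len_of, chunk_tokens):
        # Greedily merge sentences into chunks of at most chunk_tokens tokens.
        vectorizer = TfidfVectorizer()
        chunks, now_text, now_len = [], '', 0
        for text in sentences:
            token_len = token_len_of(text)
            fits_half = now_len + token_len < max(chunk_tokens // 2, 128)
            fits_full = now_len + token_len < chunk_tokens
            if now_text and not (fits_half or (fits_full and similar(vectorizer, now_text, text))):
                chunks.append(now_text)
                now_text, now_len = '', 0
            now_text += text + '\n'
            now_len += token_len
        if now_text:
            chunks.append(now_text)
        return chunks

For example, group_sentences(sentences, split_tools.get_tokens, chunk_tokens=1024) mirrors how the loop in base_parser consumes self.chunk_tokens.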