diff --git a/README.md b/README.md index a89c41d77633c2ecc7281237b28c94a4539745b6..737a000381e7b9959a507f75df87058ff1f6c342 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,16 @@

-中文 | [English](./README_en.md) | [日本語](./README_ja.md) +中文 | [English](./README_en.md) [![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR) [![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/) -![python](https://img.shields.io/badge/python-3.8+-aff.svg) +![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg) ![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg) +![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg) + [![Website](https://img.shields.io/badge/Website-PaddleOCR-blue?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAABmmRkdj0AAAAASUVORK5CYII=)](https://www.paddleocr.ai/) [![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI) @@ -21,12 +23,12 @@ ## 🚀 简介 -PaddleOCR自发布以来凭借学术前沿算法和产业落地实践,受到了产学研各方的喜爱,并被广泛应用于众多知名开源项目,例如:Umi-OCR、OmniParser、MinerU、RAGFlow等,已成为广大开发者心中的开源OCR领域的首选工具。2025年5月20日,飞桨团队发布**PaddleOCR 3.0**,全面适配**飞桨框架3.0正式版**,进一步**提升文字识别精度**,支持**多文字类型识别**和**手写体识别**,满足大模型应用对**复杂文档高精度解析**的旺盛需求,结合**文心大模型4.5 Turbo**显著提升关键信息抽取精度,并新增**对昆仑芯、昇腾等国产硬件**的支持。 +PaddleOCR自发布以来凭借学术前沿算法和产业落地实践,受到了产学研各方的喜爱,并被广泛应用于众多知名开源项目,例如:Umi-OCR、OmniParser、MinerU、RAGFlow等,已成为广大开发者心中的开源OCR领域的首选工具。2025年5月20日,飞桨团队发布**PaddleOCR 3.0**,全面适配**飞桨框架3.0正式版**,进一步**提升文字识别精度**,支持**多文字类型识别**和**手写体识别**,满足大模型应用对**复杂文档高精度解析**的旺盛需求,结合**文心大模型4.5 Turbo**显著提升关键信息抽取精度,并新增**对昆仑芯、昇腾等国产硬件**的支持。完整使用文档请参考 [PaddleOCR 3.0 文档](https://paddlepaddle.github.io/PaddleOCR/latest/)。 -PaddleOCR 3.0**新增**三大特色能力:: -- 全场景文字识别模型[PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):单模型支持五种文字类型和复杂手写体识别;整体识别精度相比上一代**提升13个百分点**。 -- 通用文档解析方案[PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支持多场景、多版式 PDF 高精度解析,在公开评测集中**领先众多开源和闭源方案**。 -- 智能文档理解方案[PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支持文心大模型4.5 Turbo,精度相比上一代**提升15个百分点**。 +PaddleOCR 3.0**新增**三大特色能力: +- 全场景文字识别模型[PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):单模型支持五种文字类型和复杂手写体识别;整体识别精度相比上一代**提升13个百分点**。[在线体验](https://aistudio.baidu.com/community/app/91660/webUI) +- 通用文档解析方案[PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支持多场景、多版式 PDF 高精度解析,在公开评测集中**领先众多开源和闭源方案**。[在线体验](https://aistudio.baidu.com/community/app/518494/webUI) +- 智能文档理解方案[PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支持文心大模型4.5 Turbo,精度相比上一代**提升15个百分点**。[在线体验](https://aistudio.baidu.com/community/app/518493/webUI) PaddleOCR 3.0除了提供优秀的模型库外,还提供好学易用的工具,覆盖模型训练、推理和服务化部署,方便开发者快速落地AI应用。
@@ -37,6 +39,14 @@ PaddleOCR 3.0除了提供优秀的模型库外,还提供好学易用的工具 ## 📣 最新动态 +🔥🔥2025.06.05: **PaddleOCR 3.0.1** 发布,包含: + +- **优化部分模型和模型配置:** + - 更新 PP-OCRv5默认模型配置,检测和识别均由mobile改为server模型。为了改善大多数的场景默认效果,配置中的参数`limit_side_len`由736改为64 + - 新增文本行方向分类`PP-LCNet_x1_0_textline_ori`模型,精度99.42%,OCR、PP-StructureV3、PP-ChatOCRv4产线的默认文本行方向分类器改为该模型 + - 优化文本行方向分类`PP-LCNet_x0_25_textline_ori`模型,精度提升3.3个百分点,当前精度98.85% +- **优化和修复3.0.0版本部分存在的问题,[详情](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)** + 🔥🔥2025.05.20: **PaddleOCR 3.0** 正式发布,包含: - **PP-OCRv5**: 全场景高精度文字识别 @@ -74,13 +84,13 @@ pip install paddleocr ### 3. 命令行方式推理 ```bash # 运行 PP-OCRv5 推理 -paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png +paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False # 运行 PP-StructureV3 推理 -paddleocr PP-StructureV3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png +paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False -# 运行 PP-ChatOCRv4 推理前,需要先获得千帆KPI Key -paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key +# 运行 PP-ChatOCRv4 推理前,需要先获得千帆API Key +paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False # 查看 "paddleocr ocr" 详细参数 paddleocr ocr --help @@ -91,9 +101,13 @@ paddleocr ocr --help ```python from paddleocr import PaddleOCR # 初始化 PaddleOCR 实例 -ocr = PaddleOCR() +ocr = PaddleOCR( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False) # 对示例图像执行 OCR 推理 -result = ocr.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png") +result = ocr.predict( + input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png") # 可视化结果并保存 json 结果 for res in result: res.print() @@ -108,45 +122,21 @@ for res in result: from pathlib import Path from paddleocr import PPStructureV3 -pipeline = PPStructureV3() +pipeline = PPStructureV3( + use_doc_orientation_classify=False, + use_doc_unwarping=False +) # For Image -output = pipeline.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png") +output = pipeline.predict( + input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png", + ) # 可视化结果并保存 json 结果 for res in output: res.print() res.save_to_json(save_path="output") res.save_to_markdown(save_path="output") - -# For PDF File -input_file = "./your_pdf_file.pdf" -output_path = Path("./output") - -output = pipeline.predict(input_file) - -markdown_list = [] -markdown_images = [] - -for res in output: - md_info = res.markdown - markdown_list.append(md_info) - markdown_images.append(md_info.get("markdown_images", {})) - -markdown_texts = pipeline.concatenate_markdown_pages(markdown_list) - -mkd_file_path = output_path / f"{Path(input_file).stem}.md" -mkd_file_path.parent.mkdir(parents=True, exist_ok=True) - -with open(mkd_file_path, "w", encoding="utf-8") as f: - f.write(markdown_texts) - -for item in 
markdown_images: - if item: - for path, image in item.items(): - file_path = output_path / path - file_path.parent.mkdir(parents=True, exist_ok=True) - image.save(file_path) ``` @@ -174,25 +164,37 @@ retriever_config = { "api_key": "api_key", # your api_key } -mllm_chat_bot_config = { - "module_name": "chat_bot", - "model_name": "PP-DocBee", - "base_url": "http://127.0.0.1:8080/", # your local mllm service url - "api_type": "openai", - "api_key": "api_key", # your api_key -} - -pipeline = PPChatOCRv4Doc() +pipeline = PPChatOCRv4Doc( + use_doc_orientation_classify=False, + use_doc_unwarping=False +) visual_predict_res = pipeline.visual_predict( input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png", - use_doc_orientation_classify=False, - use_doc_unwarping=False, use_common_ocr=True, use_seal_recognition=True, use_table_recognition=True, ) +mllm_predict_info = None +use_mllm = False +# 如果使用多模态大模型,需要启动本地 mllm 服务,可以参考文档:https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md 进行部署,并更新 mllm_chat_bot_config 配置。 +if use_mllm: + mllm_chat_bot_config = { + "module_name": "chat_bot", + "model_name": "PP-DocBee", + "base_url": "http://127.0.0.1:8080/", # your local mllm service url + "api_type": "openai", + "api_key": "api_key", # your api_key + } + + mllm_predict_res = pipeline.mllm_pred( + input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png", + key_list=["驾驶室准乘人数"], + mllm_chat_bot_config=mllm_chat_bot_config, + ) + mllm_predict_info = mllm_predict_res["mllm_res"] + visual_info_list = [] for res in visual_predict_res: visual_info_list.append(res["visual_info"]) @@ -201,12 +203,6 @@ for res in visual_predict_res: vector_info = pipeline.build_vector( visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config ) -mllm_predict_res = pipeline.mllm_pred( - input="vehicle_certificate-1.png", - key_list=["驾驶室准乘人数"], - mllm_chat_bot_config=mllm_chat_bot_config, -) -mllm_predict_info = mllm_predict_res["mllm_res"] chat_result = pipeline.chat( key_list=["驾驶室准乘人数"], visual_info=visual_info_list, diff --git a/README_en.md b/README_en.md index 32d98761557f4b40750660b73f20007274cc137a..37d0907868b1a5d3f2e68a749e62d37e670d409e 100644 --- a/README_en.md +++ b/README_en.md @@ -10,8 +10,9 @@ [![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR) [![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/) -![python](https://img.shields.io/badge/python-3.8+-aff.svg) +![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg) ![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg) +![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg) [![Website](https://img.shields.io/badge/Website-PaddleOCR-blue?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAABmmRkdj0AAAAASUVORK5CYII=)](https://www.paddleocr.ai/) [![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI) @@ -23,14 +24,14 @@ ## 🚀 Introduction Since its initial release, PaddleOCR has gained widespread acclaim across academia, industry, and research communities, thanks to its cutting-edge algorithms and proven performance in real-world applications. 
It’s already powering popular open-source projects like Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide. -On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with the **ERNIE 4.5T**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for domestic hardware platforms such as **KUNLUNXIN** and **Ascend**. +On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with **ERNIE 4.5 Turbo**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for domestic hardware platforms such as **KUNLUNXIN** and **Ascend**. For the complete usage documentation, please refer to the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html). -Three Major New Features in PaddleOCR 3.0 -- Universal-Scene Text Recognition Model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. +Three Major New Features in PaddleOCR 3.0: +- Universal-Scene Text Recognition Model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. [Online Demo](https://aistudio.baidu.com/community/app/91660/webUI) -- General Document-Parsing Solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. +- General Document-Parsing Solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online Demo](https://aistudio.baidu.com/community/app/518494/webUI) -- Intelligent Document-Understanding Solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by the WenXin large model 4.5T, achieving 15 percentage points higher accuracy than its predecessor. +- Intelligent Document-Understanding Solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by ERNIE 4.5 Turbo, achieving 15 percentage points higher accuracy than its predecessor. 
[Online Demo](https://aistudio.baidu.com/community/app/518493/webUI) In addition to providing an outstanding model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can rapidly bring AI applications to production.
@@ -42,9 +43,19 @@ In addition to providing an outstanding model library, PaddleOCR 3.0 also offers ## 📣 Recent updates + +#### **🔥🔥 2025.06.05: Release of PaddleOCR 3.0.1, which includes:** + +- **Optimization of certain models and model configurations:** + - Updated the default model configuration for PP-OCRv5, changing both detection and recognition from mobile to server models. To improve default performance in most scenarios, the parameter `limit_side_len` in the configuration has been changed from 736 to 64. + - Added a new text line orientation classification model `PP-LCNet_x1_0_textline_ori` with an accuracy of 99.42%. The default text line orientation classifier for OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model. + - Optimized the text line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current accuracy of 98.85%. + +- **Optimizations and fixes for some issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)** + 🔥🔥2025.05.20: Official Release of **PaddleOCR v3.0**, including: - **PP-OCRv5**: High-Accuracy Text Recognition Model for All Scenarios - Instant Text from Images/PDFs. - 1. 🌐 Single-model support for **five** text types - Seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English** and **Japanse** within a single model. + 1. 🌐 Single-model support for **five** text types - Seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English** and **Japanese** within a single model. 2. ✍️ Improved **handwriting recognition**: Significantly better at complex cursive scripts and non-standard handwriting. 3. 🎯 **13-point accuracy gain** over PP-OCRv4, achieving state-of-the-art performance across a variety of real-world scenarios. @@ -92,13 +103,13 @@ pip install paddleocr ### 3. Run inference by CLI ```bash # Run PP-OCRv5 inference -paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png +paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False # Run PP-StructureV3 inference -paddleocr PP-StructureV3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png +paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False # Get the Qianfan API Key at first, and then run PP-ChatOCRv4 inference -paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key +paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False # Get more information about "paddleocr ocr" paddleocr ocr --help @@ -107,13 +118,15 @@ paddleocr ocr --help ### 4. 
Run inference by API **4.1 PP-OCRv5 Example** ```python from paddleocr import PaddleOCR # Initialize PaddleOCR instance -ocr = PaddleOCR() +ocr = PaddleOCR( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False) # Run OCR inference on a sample image -result = ocr.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png") +result = ocr.predict( + input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png") # Visualize the results and save the JSON results for res in result: @@ -132,41 +145,17 @@ from paddleocr import PPStructureV3 pipeline = PPStructureV3() # For Image -output = pipeline.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png") +output = pipeline.predict( + input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png", + use_doc_orientation_classify=False, + use_doc_unwarping=False + ) # Visualize the results and save the JSON results for res in output: res.print() - res.save_to_json(save_path="output") - res.save_to_markdown(save_path="output") -# For PDF File -input_file = "./your_pdf_file.pdf" -output_path = Path("./output") - -output = pipeline.predict(input_file) - -markdown_list = [] -markdown_images = [] - -for res in output: - md_info = res.markdown - markdown_list.append(md_info) - markdown_images.append(md_info.get("markdown_images", {})) - -markdown_texts = pipeline.concatenate_markdown_pages(markdown_list) - -mkd_file_path = output_path / f"{Path(input_file).stem}.md" -mkd_file_path.parent.mkdir(parents=True, exist_ok=True) - -with open(mkd_file_path, "w", encoding="utf-8") as f: - f.write(markdown_texts) - -for item in markdown_images: - if item: - for path, image in item.items(): - file_path = output_path / path - file_path.parent.mkdir(parents=True, exist_ok=True) - image.save(file_path) + res.save_to_json(save_path="output") + res.save_to_markdown(save_path="output") ``` @@ -193,25 +182,37 @@ retriever_config = { "api_key": "api_key", # your api_key } -mllm_chat_bot_config = { - "module_name": "chat_bot", - "model_name": "PP-DocBee", - "base_url": "http://127.0.0.1:8080/", # your local mllm service url - "api_type": "openai", - "api_key": "api_key", # your api_key -} - -pipeline = PPChatOCRv4Doc() +pipeline = PPChatOCRv4Doc( + use_doc_orientation_classify=False, + use_doc_unwarping=False ) visual_predict_res = pipeline.visual_predict( input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png", - use_doc_orientation_classify=False, - use_doc_unwarping=False, use_common_ocr=True, use_seal_recognition=True, use_table_recognition=True, ) +mllm_predict_info = None +use_mllm = False +# If a multimodal large model is used, the local mllm service needs to be started. You can refer to the documentation https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md to deploy the service, and update the mllm_chat_bot_config configuration accordingly. 
+if use_mllm: + mllm_chat_bot_config = { + "module_name": "chat_bot", + "model_name": "PP-DocBee", + "base_url": "http://127.0.0.1:8080/", # your local mllm service url + "api_type": "openai", + "api_key": "api_key", # your api_key + } + + mllm_predict_res = pipeline.mllm_pred( + input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png", + key_list=["驾驶室准乘人数"], + mllm_chat_bot_config=mllm_chat_bot_config, + ) + mllm_predict_info = mllm_predict_res["mllm_res"] + visual_info_list = [] for res in visual_predict_res: visual_info_list.append(res["visual_info"]) @@ -220,12 +221,6 @@ for res in visual_predict_res: vector_info = pipeline.build_vector( visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config ) -mllm_predict_res = pipeline.mllm_pred( - input="vehicle_certificate-1.png", - key_list=["驾驶室准乘人数"], - mllm_chat_bot_config=mllm_chat_bot_config, -) -mllm_predict_info = mllm_predict_res["mllm_res"] chat_result = pipeline.chat( key_list=["驾驶室准乘人数"], visual_info=visual_info_list, diff --git a/configs/det/PP-OCRv5/PP-OCRv5_server_det.yml b/configs/det/PP-OCRv5/PP-OCRv5_server_det.yml index d546fe2ac30ecf07b9aa2f518f54f365543112a3..7c88fc875408c8604af70a1b2bc6a9c05181bbb9 100644 --- a/configs/det/PP-OCRv5/PP-OCRv5_server_det.yml +++ b/configs/det/PP-OCRv5/PP-OCRv5_server_det.yml @@ -142,7 +142,6 @@ Eval: label_file_list: - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt transforms: - transforms: - DecodeImage: img_mode: BGR channel_first: false diff --git a/deploy/android_demo/gradlew b/deploy/android_demo/gradlew index cccdd3d517fc5249beaefa600691cf150f2fa3e6..dfc1f9b7a14c77e5270d6dcb74dc45ff8f4554c9 100644 --- a/deploy/android_demo/gradlew +++ b/deploy/android_demo/gradlew @@ -75,11 +75,9 @@ if [ -n "$JAVA_HOME" ] ; then JAVACMD="$JAVA_HOME/bin/java" fi if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME\n\nPlease set the JAVA_HOME variable in your environment to match the location of your Java installation." fi + else JAVACMD="java" which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. diff --git a/deploy/cpp_infer/CMakeLists.txt b/deploy/cpp_infer/CMakeLists.txt index f993f06563f455b3892cdf82d740f903ace3d68a..ce69d672ea49cfb7fa2d6b6f90861d3874af13f1 100644 --- a/deploy/cpp_infer/CMakeLists.txt +++ b/deploy/cpp_infer/CMakeLists.txt @@ -62,6 +62,7 @@ if (WIN32) if (WITH_STATIC_LIB) safe_set_static_flag() add_definitions(-DSTATIC_LIB) + add_definitions(-DYAML_CPP_STATIC_DEFINE) endif() message("cmake c debug flags " ${CMAKE_C_FLAGS_DEBUG}) message("cmake c release flags " ${CMAKE_C_FLAGS_RELEASE}) diff --git a/deploy/cpp_infer/include/ocr_cls.h b/deploy/cpp_infer/include/ocr_cls.h index d34da13fff73d66464808b1e3683d5164ef8b12d..cb8de8b91f3509cfd7324a3a4da171076e5138d2 100644 --- a/deploy/cpp_infer/include/ocr_cls.h +++ b/deploy/cpp_infer/include/ocr_cls.h @@ -55,7 +55,9 @@ public: if (config["Global"] && config["Global"]["model_name"]) { model_name = config["Global"]["model_name"].as<std::string>(); } - if (!model_name.empty()) { + if (!model_name.empty() && + model_name != "PP-LCNet_x0_25_textline_ori" && + model_name != "PP-LCNet_x1_0_textline_ori") { std::cerr << "Error: " << model_name << " is currently not supported." 
<< std::endl; std::exit(EXIT_FAILURE); diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h index 59b0e0dc1004ee4c1cbb1df5624eb2154cff5f19..dba72d9a73c9212c965eb90e173ec9059495a6c1 100644 --- a/deploy/cpp_infer/include/ocr_det.h +++ b/deploy/cpp_infer/include/ocr_det.h @@ -66,7 +66,8 @@ public: if (config["Global"] && config["Global"]["model_name"]) { model_name = config["Global"]["model_name"].as<std::string>(); } - if (!model_name.empty()) { + if (!model_name.empty() && model_name != "PP-OCRv5_mobile_det" && + model_name != "PP-OCRv5_server_det") { std::cerr << "Error: " << model_name << " is currently not supported." << std::endl; std::exit(EXIT_FAILURE); diff --git a/deploy/cpp_infer/include/ocr_rec.h b/deploy/cpp_infer/include/ocr_rec.h index 9f27a7fbfaebef9b0cd9147b570d2551025b1542..db8b1b38aa2c7428ba2a37fa874146bdd391fe9f 100644 --- a/deploy/cpp_infer/include/ocr_rec.h +++ b/deploy/cpp_infer/include/ocr_rec.h @@ -61,7 +61,8 @@ public: if (config["Global"] && config["Global"]["model_name"]) { model_name = config["Global"]["model_name"].as<std::string>(); } - if (!model_name.empty()) { + if (!model_name.empty() && model_name != "PP-OCRv5_mobile_rec" && + model_name != "PP-OCRv5_server_rec") { std::cerr << "Error: " << model_name << " is currently not supported." << std::endl; std::exit(EXIT_FAILURE); diff --git a/deploy/cpp_infer/src/preprocess_op.cpp b/deploy/cpp_infer/src/preprocess_op.cpp index 4b5d9c2286cfde042f22b0a182a058951ab609c1..1970ae6481c3b07426a0f7bfa877b312d1709f42 100644 --- a/deploy/cpp_infer/src/preprocess_op.cpp +++ b/deploy/cpp_infer/src/preprocess_op.cpp @@ -114,7 +114,7 @@ void CrnnResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, float wh_ratio, cv::INTER_LINEAR); cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, int(imgW - resize_img.cols), cv::BORDER_CONSTANT, - {0, 0, 0}); + {128, 128, 128}); } void ClsResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, diff --git a/docs/FAQ.md b/docs/FAQ.md index b710ba63a11500dbbcac8d1b260189fd8fbdc8c9..89508e9d6d3caf5887f3744e68c75933dff677f9 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -427,11 +427,11 @@ checkpoints:指之前训练的中间结果,例如前一次训练到了100个 #### Q: PP-OCR系统中,文本检测的结果有置信度吗? -**A**:文本检测的结果有置信度,由于推理过程中没有使用,所以没有显示的返回到最终结果中。如果需要文本检测结果的置信度,可以在[文本检测DB的后处理代码](../../ppocr/postprocess/db_postprocess.py)的155行,添加scores信息。这样,在[检测预测代码](../../tools/infer/predict_det.py)的197行,就可以拿到文本检测的scores信息。 +**A**:文本检测的结果有置信度,由于推理过程中没有使用,所以没有显式地返回到最终结果中。如果需要文本检测结果的置信度,可以在[文本检测DB的后处理代码](../ppocr/postprocess/db_postprocess.py)的155行,添加scores信息。这样,在[检测预测代码](../tools/infer/predict_det.py)的197行,就可以拿到文本检测的scores信息。 #### Q: DB文本检测,特征提取网络金字塔构建的部分代码在哪儿? -**A**:特征提取网络金字塔构建的部分:[代码位置](../../ppocr/modeling/necks/db_fpn.py)。ppocr/modeling文件夹里面是组网相关的代码,其中architectures是文本检测或者文本识别整体流程代码;backbones是骨干网络相关代码;necks是类似与FPN的颈函数代码;heads是提取文本检测或者文本识别预测结果相关的头函数;transforms是类似于TPS特征预处理模块。更多的信息可以参考[代码组织结构](./tree.md)。 +**A**:特征提取网络金字塔构建的部分:[代码位置](../ppocr/modeling/necks/db_fpn.py)。ppocr/modeling文件夹里面是组网相关的代码,其中architectures是文本检测或者文本识别整体流程代码;backbones是骨干网络相关代码;necks是类似于FPN的颈函数代码;heads是提取文本检测或者文本识别预测结果相关的头函数;transforms是类似于TPS特征预处理模块。更多的信息可以参考[代码组织结构](./tree.md)。 #### Q:PaddleOCR如何做到横排和竖排同时支持的? diff --git a/docs/index.en.md b/docs/index.en.md index 22bb6f0185218db7e0db3edd9dcf29010e902ae8..84d4843f92267588901cd49c7cc6337a3f248788 100644 --- a/docs/index.en.md +++ b/docs/index.en.md @@ -4,54 +4,40 @@ hide: - navigation - toc ---
-<div align="center">
- <img src="./images/Banner.png" alt="PaddleOCR Banner">
-</div>
+ + + +![PaddleOCR Banner](./images/Banner.png) Since its initial release, PaddleOCR has gained widespread acclaim across academia, industry, and research communities, thanks to its cutting-edge algorithms and proven performance in real-world applications. It’s already powering popular open-source projects like Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide. -On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the [PaddlePaddle 3.0](https://github.com/PaddlePaddle/Paddle) framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with the **ERNIE 4.5T**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for domestic hardware platforms such as **KUNLUNXIN** and **Ascend**. +On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the [PaddlePaddle 3.0](https://github.com/PaddlePaddle/Paddle) framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with **ERNIE 4.5 Turbo**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for domestic hardware platforms such as **KUNLUNXIN** and **Ascend**. Three Major New Features in PaddleOCR 3.0: -- 🖼️ Universal-Scene Text Recognition Model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. +- 🖼️ Universal-Scene Text Recognition Model [PP-OCRv5](./version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. [Online Demo](https://aistudio.baidu.com/community/app/91660/webUI) -- 🧮 General Document-Parsing Solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. +- 🧮 General Document-Parsing Solution [PP-StructureV3](./version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online Demo](https://aistudio.baidu.com/community/app/518494/webUI) -- 📈 Intelligent Document-Understanding Solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by the WenXin large model 4.5T, achieving 15.7 percentage points higher accuracy than its predecessor. 
+- 📈 Intelligent Document-Understanding Solution [PP-ChatOCRv4](./version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by ERNIE 4.5 Turbo, achieving 15 percentage points higher accuracy than its predecessor. [Online Demo](https://aistudio.baidu.com/community/app/518493/webUI) In addition to providing an outstanding model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can rapidly bring AI applications to production.
-<div align="center">
- <img src="./images/Arch.png" alt="PaddleOCR Architecture">
-</div>
-You can [Quick Start](#-quick-start) directly, find comprehensive documentation in the [PaddleOCR Docs](https://paddlepaddle.github.io/PaddleOCR/main/index.html), get support via [Github Issus](https://github.com/PaddlePaddle/PaddleOCR/issues), and explore our OCR courses on [OCR courses on AIStudio](https://aistudio.baidu.com/course/introduce/25207). +![PaddleOCR Arch](./images/Arch.png) + +You can [Quick Start](./quick_start.en.md) directly, find comprehensive documentation in the [PaddleOCR Docs](https://paddlepaddle.github.io/PaddleOCR/main/index.html), get support via [GitHub Issues](https://github.com/PaddlePaddle/PaddleOCR/issues), and explore our [OCR courses on AIStudio](https://aistudio.baidu.com/course/introduce/25207). ## 🔄 Quick Overview of Execution Results
-<div align="center">
- <img src="./images/demo.gif" alt="PP-OCRv5 Demo">
-</div>
-
-<div align="center">
- <img src="./images/blue_v3.gif" alt="PP-StructureV3 Demo">
-</div>
+![PP-OCRv5 Demo](./images/demo.gif) + +![PP-StructureV3 Demo](./images/blue_v3.gif) + ## 👩‍👩‍👧‍👦 Community * 👫 Join the [PaddlePaddle Community](https://github.com/PaddlePaddle/community), where you can engage with [paddlepaddle developers](https://www.paddlepaddle.org.cn/developercommunity), researchers, and enthusiasts from around the world. * 🎓 Learn from experts through workshops, tutorials, and Q&A sessions [hosted by the AI Studio](https://aistudio.baidu.com/learn/center). * 🏆 Participate in [hackathons, challenges, and competitions](https://aistudio.baidu.com/competition) to showcase your skills and win exciting prizes. -* 📣 Stay updated with the latest news, announcements, and events by following our [Twitter](https://x.com/PaddlePaddle) and [WeChat](https://mp.weixin.qq.com/s/MAdo7fZ6dfeGcCQUtRP2ag). +* 📣 Stay updated with the latest news, announcements, and events by following our [Twitter](https://x.com/PaddlePaddle) and [WeChat](https://mp.weixin.qq.com/s/vYj1ZDcAfJ1lu_DzlOKgtQ). diff --git a/docs/index.md b/docs/index.md index f7b30b297ccdcf18742012cd013d41a06e555397..72b15c51edfb94aa5bb868ebf508c8b2a7a815cf 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,7 +7,7 @@ hide: @@ -18,35 +18,36 @@ PaddleOCR自发布以来凭借学术前沿算法和产业落地实践,受到 PaddleOCR 3.0**新增**三大特色能力: -- 🖼️全场景文字识别模型[PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):单模型支持五种文字类型和复杂手写体识别;整体识别精度相比上一代**提升13个百分点**。 +- 🖼️全场景文字识别模型[PP-OCRv5](./version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):单模型支持五种文字类型和复杂手写体识别;整体识别精度相比上一代**提升13个百分点**。[在线体验](https://aistudio.baidu.com/community/app/91660/webUI) -- 🧮通用文档解析方案[PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支持多场景、多版式 PDF 高精度解析,在公开评测集中**领先众多开源和闭源方案**。 +- 🧮通用文档解析方案[PP-StructureV3](./version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支持多场景、多版式 PDF 高精度解析,在公开评测集中**领先众多开源和闭源方案**。[在线体验](https://aistudio.baidu.com/community/app/518494/webUI) + +- 📈智能文档理解方案[PP-ChatOCRv4](./version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支持文心大模型4.5 Turbo,精度相比上一代**提升15个百分点**。[在线体验](https://aistudio.baidu.com/community/app/518493/webUI) -- 📈智能文档理解方案[PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支持文心大模型4.5 Turbo,精度相比上一代**提升15.7个百分点**。 PaddleOCR 3.0除了提供优秀的模型库外,还提供好学易用的工具,覆盖模型训练、推理和服务化部署,方便开发者快速落地AI应用。 -您可直接[快速开始](#-快速开始),或查阅完整的 [PaddleOCR 文档](https://paddlepaddle.github.io/PaddleOCR/main/index.html),或通过 [Github Issues](https://github.com/PaddlePaddle/PaddleOCR/issues) 获取支持,或在 [AIStudio 课程平台](https://aistudio.baidu.com/course/introduce/25207) 探索我们的 OCR 课程。 +您可直接[快速开始](./quick_start.md),或查阅完整的 [PaddleOCR 文档](https://paddlepaddle.github.io/PaddleOCR/main/index.html),或通过 [Github Issues](https://github.com/PaddlePaddle/PaddleOCR/issues) 获取支持,或在 [AIStudio 课程平台](https://aistudio.baidu.com/course/introduce/25207) 探索我们的 OCR 课程。 ## 🔄 快速一览运行效果 @@ -56,4 +57,4 @@ PaddleOCR 3.0除了提供优秀的模型库外,还提供好学易用的工具 * 👫 加入 [PaddlePaddle 开发者社区](https://github.com/PaddlePaddle/community),与全球开发者、研究人员互动交流 * 🎓 通过 AI Studio 的 [技术研讨会](https://aistudio.baidu.com/learn/center) 学习前沿技术 * 🏆 参与 [黑客马拉松](https://aistudio.baidu.com/competition) 展示才能,赢取奖励 -* 📣 关注 [微信公众号](https://mp.weixin.qq.com/s/MAdo7fZ6dfeGcCQUtRP2ag) 获取最新动态 +* 📣 关注 [微信公众号](https://mp.weixin.qq.com/s/vYj1ZDcAfJ1lu_DzlOKgtQ) 获取最新动态 diff --git a/docs/quick_start.en.md b/docs/quick_start.en.md index fc13221eb088cda9a8a5df4f8aabe4c8ba15ed3f..bb248bf87260950ce83cdfb1f6fc18e84a4053fe 100644 --- a/docs/quick_start.en.md +++ b/docs/quick_start.en.md @@ -28,40 +28,49 @@ pip install paddleocr ### Command Line Usage -=== "OCR Pipeline" 
+=== "PP-OCRv5" ```bash linenums="1" - paddleocr ocr -i ./general_ocr_002.png + paddleocr ocr -i ./general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False ``` -=== "Text Detection Module" +=== "PP-OCRv5 Text Detection Module" ```bash linenums="1" paddleocr text_detection -i ./general_ocr_001.png ``` -=== "Text Recognition Module" +=== "PP-OCRv5 Text Recognition Module" ```bash linenums="1" paddleocr text_recognition -i ./general_ocr_rec_001.png ``` -=== "PP-StructureV3 Pipeline" +=== "PP-StructureV3" ```bash linenums="1" - paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png + paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False ``` ### Python Script Usage -=== "OCR Pipeline" +=== "PP-OCRv5" ```python linenums="1" from paddleocr import PaddleOCR - ocr = PaddleOCR() # text image preprocessing + text detection + text orientation classification + text recognition - # ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False) # text detection + text orientation classification + text recognition - # ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False, use_textline_orientation=False) # text detection + text recognition + ocr = PaddleOCR( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False) # text detection + text recognition + # ocr = PaddleOCR(use_doc_orientation_classify=True, use_doc_unwarping=True) # text image preprocessing + text detection + textline orientation classification + text recognition + # ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False) # text detection + textline orientation classification + text recognition + # ocr = PaddleOCR( + # text_detection_model_name="PP-OCRv5_mobile_det", + # text_recognition_model_name="PP-OCRv5_mobile_rec", + # use_doc_orientation_classify=False, + # use_doc_unwarping=False, + # use_textline_orientation=False) # Switch to PP-OCRv5_mobile models result = ocr.predict("./general_ocr_002.png") for res in result: res.print() @@ -93,7 +102,7 @@ pip install paddleocr [ 99, ..., 480]], dtype=int16)}} ``` -=== "Text Detection Module" +=== "PP-OCRv5 Text Detection Module" ```python linenums="1" from paddleocr import TextDetection @@ -118,7 +127,7 @@ pip install paddleocr [ 36, 456]]], dtype=int16), 'dt_scores': [0.8562385635646694, 0.8818259002228059, 0.8406072284043453, 0.8855339313157491]}} ``` -=== "Text Recognition Module" +=== "PP-OCRv5 Text Recognition Module" ```python linenums="1" from paddleocr import TextRecognition @@ -137,18 +146,17 @@ pip install paddleocr {'res': {'input_path': 'general_ocr_rec_001.png', 'page_index': None, 'rec_text': '绿洲仕格维花园公寓', 'rec_score': 0.990813672542572}} ``` -=== "PP-StructureV3 Pipeline" +=== "PP-StructureV3" ```python linenums="1" from paddleocr import PPStructureV3 - pipeline = PPStructureV3() - output = pipeline.predict( - input="./pp_structure_v3_demo.png", + pipeline = PPStructureV3( use_doc_orientation_classify=False, - use_doc_unwarping=False, - use_textline_orientation=False, + use_doc_unwarping=False ) + output = pipeline.predict( + input="./pp_structure_v3_demo.png") for res in output: res.print() res.save_to_json(save_path="output") diff --git a/docs/quick_start.md b/docs/quick_start.md index 6031c6805f5843badb40ccb3a751a5fe8039a9b4..4f8116c3015cd983437caeba56c420522a3f0016 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -28,40 +28,49 @@ pip install 
paddleocr ### 命令行使用 -=== "OCR产线" +=== "PP-OCRv5" ```bash linenums="1" - paddleocr ocr -i ./general_ocr_002.png + paddleocr ocr -i ./general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False ``` -=== "文本检测模块" +=== "PP-OCRv5文本检测模块" ```bash linenums="1" - paddleocr text_detection -i ./general_ocr_001.png + paddleocr text_detection -i ./general_ocr_001.png ``` -=== "文本识别模块" +=== "PP-OCRv5文本识别模块" ```bash linenums="1" paddleocr text_recognition -i ./general_ocr_rec_001.png ``` -=== "PP-StructureV3产线" +=== "PP-StructureV3" ```bash linenums="1" - paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png + paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False ``` ### Python脚本使用 -=== "OCR产线" +=== "PP-OCRv5" ```python linenums="1" from paddleocr import PaddleOCR - ocr = PaddleOCR() # 文本图像预处理+文本检测+方向分类+文本识别 - # ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False) # 文本检测+方向分类+文本识别 - # ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False, use_textline_orientation=False) # 文本检测+文本识别 + ocr = PaddleOCR( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False) # 文本检测+文本识别 + # ocr = PaddleOCR(use_doc_orientation_classify=True, use_doc_unwarping=True) # 文本图像预处理+文本检测+方向分类+文本识别 + # ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False) # 文本检测+文本行方向分类+文本识别 + # ocr = PaddleOCR( + # text_detection_model_name="PP-OCRv5_mobile_det", + # text_recognition_model_name="PP-OCRv5_mobile_rec", + # use_doc_orientation_classify=False, + # use_doc_unwarping=False, + # use_textline_orientation=False) # 更换 PP-OCRv5_mobile 模型 result = ocr.predict("./general_ocr_002.png") for res in result: res.print() @@ -72,28 +81,28 @@ pip install paddleocr 输出示例: ```bash - {'res': {'input_path': './general_ocr_002.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[ 1, 4], + {'res': {'input_path': '/root/.paddlex/predict_input/general_ocr_002.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[ 3, 10], + ..., + [ 4, 30]], + ..., - [ 1, 33]], - ..., + [[ 99, 456], + ..., + [ 99, 479]]], dtype=int16), 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'rec_texts': ['www.997700', '', 'Cm', '登机牌', 'BOARDING', 'PASS', 'CLASS', '序号SERIAL NO.', '座位号', 'SEAT NO.', '航班FLIGHT', '日期DATE', '舱位', '', 'W', '035', '12F', 'MU2379', '03DEc', '始发地', 'FROM', '登机口', 'GATE', '登机时间BDT', '目的地TO', '福州', 'TAIYUAN', 'G11', 'FUZHOU', '身份识别IDNO.', '姓名NAME', 'ZHANGQIWEI', '票号TKT NO.', '张祺伟', '票价FARE', 'ETKT7813699238489/1', '登机口于起飞前10分钟关闭 GATESCL0SE10MINUTESBEFOREDEPARTURETIME'], 'rec_scores': array([0.67582953, ..., 0.97418666]), 'rec_polys': array([[[ 3, 10], + ..., + [ 4, 30]], - [[ 99, 455], - ..., - [ 99, 480]]], dtype=int16), 'text_det_params': 
{'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'rec_texts': ['www.997788.com', '登机牌', 'BOARDING PASS', '舱位CLASS', '序号 SERIAL NO.', '座位号', 'SEAT NO', '航班FLIGHT', '日期', 'DATE', 'MU 2379', '03DEC', 'W', '035', '', '始发地', 'FROM', '登机口', 'GATE', '登机时间BDT', '目的地TO', '福州', 'TAIYUAN', 'G11', 'FUZHOU', '身份识别IDNO.', '姓名NAME', 'ZHANGQIWEI', '票号TKTNO.', '张祺伟', '票价FARE', 'ETKT7813699238489/1', '登机口于起飞前10分钟关闭 GATESCL0SE10MINUTESBEFOREDEPARTURETIME'], 'rec_scores': array([0.99684608, ..., 0.97179604]), 'rec_polys': array([[[ 1, 4], ..., - [ 1, 33]], - ..., - - [[ 99, 455], + [[ 99, 456], + ..., + [ 99, 479]]], dtype=int16), 'rec_boxes': array([[ 3, ..., 30], ..., - [ 99, 480]]], dtype=int16), 'rec_boxes': array([[ 1, ..., 33], - ..., - [ 99, ..., 480]], dtype=int16)}} + [ 99, ..., 479]], dtype=int16)}} ``` -=== "文本检测模块" +=== "PP-OCRv5文本检测模块" ```python linenums="1" from paddleocr import TextDetection @@ -118,7 +127,7 @@ pip install paddleocr [ 36, 456]]], dtype=int16), 'dt_scores': [0.8562385635646694, 0.8818259002228059, 0.8406072284043453, 0.8855339313157491]}} ``` -=== "文本识别模块" +=== "PP-OCRv5文本识别模块" ```python linenums="1" from paddleocr import TextRecognition @@ -137,17 +146,16 @@ pip install paddleocr {'res': {'input_path': 'general_ocr_rec_001.png', 'page_index': None, 'rec_text': '绿洲仕格维花园公寓', 'rec_score': 0.990813672542572}} ``` -=== "PP-StructureV3产线" +=== "PP-StructureV3" ```python linenums="1" from paddleocr import PPStructureV3 - pipeline = PPStructureV3() - output = pipeline.predict( - input="./pp_structure_v3_demo.png", + pipeline = PPStructureV3( use_doc_orientation_classify=False, - use_doc_unwarping=False, - use_textline_orientation=False, + use_doc_unwarping=False) + output = pipeline.predict( + input="./pp_structure_v3_demo.png", ) for res in output: res.print() diff --git a/docs/update/update.en.md b/docs/update/update.en.md index 3525153e4bfca7545ac3561b99f1ea96e80b83c0..f3aed26dbe080ca727c23152a380323301e7dc88 100644 --- a/docs/update/update.en.md +++ b/docs/update/update.en.md @@ -7,6 +7,54 @@ hide: ### Recently Update +#### **🔥🔥 2025.06.05: Release of PaddleOCR v3.0.1, which includes:** + +- **Optimisation of certain models and model configurations:** + - Updated the default model configuration for PP-OCRv5, changing both detection and recognition from mobile to server models. To improve default performance in most scenarios, the parameter `limit_side_len` in the configuration has been changed from 736 to 64. + - Added a new text line orientation classification model `PP-LCNet_x1_0_textline_ori` with an accuracy of 99.42%. The default text line orientation classifier for OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model. + - Optimised the text line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current accuracy of 98.85%. + +- **Optimisation of issues present in version 3.0.0:** + - **Improved CLI usage experience:** When using the PaddleOCR CLI without passing any parameters, a usage prompt is now provided. + - **New parameters added:** PP-ChatOCRv3 and PP-StructureV3 now support the `use_textline_orientation` parameter. + - **CPU inference speed optimisation:** All pipeline CPU inferences now enable MKL-DNN by default. 
+ - **Support for C++ inference:** The cascaded text detection and recognition stage of PP-OCRv5 now supports C++ inference. + +- **Fixes for issues present in version 3.0.0:** + - Fixed an issue where PP-StructureV3 encountered CPU inference errors due to the inability to use MKL-DNN with formula and table recognition models. + - Fixed an issue where GPU environments encountered the error `FatalError: Process abort signal is detected by the operating system` during inference. + - Fixed type hint issues in some Python 3.8 environments. + - Fixed the issue where the method `PPStructureV3.concatenate_markdown_pages` was missing. + - Fixed an issue where specifying both `lang` and `model_name` when instantiating `paddleocr.PaddleOCR` resulted in `model_name` being ineffective. + +#### **🔥🔥 2025.05.20: PaddleOCR 3.0 Official Release Highlights** + +- **PP-OCRv5: All-Scene Text Recognition Model** + - Supports five text types and complex handwriting in a single model. + - Achieves a 13-percentage-point accuracy improvement over the previous generation. + +- **PP-StructureV3: General Document Parsing Solution** + - Offers high-precision parsing for multi-scene, multi-layout PDFs. + - Outperforms numerous open and closed-source solutions in public benchmarks. + +- **PP-ChatOCRv4: Intelligent Document Understanding Solution** + - Natively supports ERNIE 4.5 Turbo. + - Delivers a 15-percentage-point accuracy boost over the previous version. + +- **Rebuilt Deployment Capabilities with Unified Inference Interface:** + - Integrates PaddleX 3.0's core features for a comprehensive upgrade of the inference and deployment modules. + - Optimizes the design from version 2.x and unifies the Python API and CLI. + - Supports high-performance inference, serving, and edge deployment scenarios. + +- **Optimized Training with PaddlePaddle Framework 3.0:** + - Compatible with the latest features such as the CINN compiler. + - Inference model files now use `xxx.json` instead of `xxx.pdmodel`. + +- **Unified Model Naming:** + - Updated naming conventions for models supported by PaddleOCR 3.0 for consistency and easier maintenance. + +- For more details, check out the [Upgrade Notes from 2.x to 3.x](./upgrade_notes.en.md). 
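+
+Since the 3.0.1 update above makes the heavier server models the default, getting the previous lightweight behaviour back means opting into the mobile models explicitly. A minimal sketch (the model-name parameters are the same ones used in the quick start; `text_det_limit_side_len` is assumed here to be the Python-API counterpart of the `limit_side_len` setting mentioned above, so check the pipeline parameter reference for your release):
+
+```python
+from paddleocr import PaddleOCR
+
+# Opt back into the pre-3.0.1 mobile defaults instead of the server models.
+ocr = PaddleOCR(
+    text_detection_model_name="PP-OCRv5_mobile_det",
+    text_recognition_model_name="PP-OCRv5_mobile_rec",
+    text_det_limit_side_len=736,  # assumed mapping of limit_side_len (3.0.1 default: 64)
+)
+result = ocr.predict(input="general_ocr_002.png")
+for res in result:
+    res.print()
+```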
+ #### **🔥🔥2025.3.7 release PaddleOCR v2.10, including**: - **12 new self-developed single models:** diff --git a/docs/update/update.md b/docs/update/update.md index e414e4b5f984e0476fcceaf694c35a7ff974341a..9f9d60dba8f9a09c52bd8889a7f80e32b1e46d8f 100644 --- a/docs/update/update.md +++ b/docs/update/update.md @@ -7,6 +7,39 @@ hide: ### 更新 +#### **🔥🔥2025.06.05: PaddleOCR v3.0.1 版本发布,包含:** +- **优化部分模型和模型配置:** + - 更新 PP-OCRv5默认模型配置,检测和识别均由mobile改为server模型。为了改善大多数的场景默认效果,配置中的参数`limit_side_len`由736改为64 + - 新增文本行方向分类`PP-LCNet_x1_0_textline_ori`模型,精度99.42%,OCR、PP-StructureV3、PP-ChatOCRv4产线的默认文本行方向分类器改为该模型 + - 优化文本行方向分类`PP-LCNet_x0_25_textline_ori`模型,精度提升3.3个百分点,当前精度98.85% +- **优化3.0.0版本部分存在的问题** + - **优化CLI使用体验:** 当使用PaddleOCR CLI不传入任何参数时,给出用法提示。 + - **新增参数:** PP-ChatOCRv3、PP-StructureV3支持`use_textline_orientation`参数。 + - **CPU推理速度优化:** 所有产线CPU推理默认开启MKL-DNN。 + - **C++推理支持:** PP-OCRv5的检测和识别串联部分支持C++推理 +- **修复3.0.0版本部分存在的问题** + - 修复由于公式识别、表格识别模型无法使用MKL-DNN导致PP-StructureV3在部分cpu推理报错的问题 + - 修复在部分GPU环境中推理报`FatalError: Process abort signal is detected by the operating system`错误的问题 + - 修复部分Python3.8环境的type hint的问题 + - 修复`PPStructureV3.concatenate_markdown_pages`方法不存在的问题。 + - 修复实例化`paddleocr.PaddleOCR`时同时指定`lang`和`model_name`时`model_name`不生效的问题。 + +#### **🔥🔥2025.05.20: PaddleOCR 3.0 正式发布,包含:** + +- **发布全场景文字识别模型PP-OCRv5:** 单模型支持五种文字类型和复杂手写体识别;整体识别精度相比上一代提升13个百分点。 + +- **发布通用文档解析方案PP-StructureV3:** 支持多场景、多版式 PDF 高精度解析,在公开评测集中领先众多开源和闭源方案。 + +- **发布智能文档理解方案PP-ChatOCRv4:** 原生支持文心大模型4.5 Turbo,精度相比上一代提升15个百分点。 + +- **重构部署能力,统一推理接口:** PaddleOCR 3.0 融合了飞桨 PaddleX3.0 工具的底层能力,全面升级推理、部署模块,优化 2.x 版本的设计,统一并优化了 Python API 和命令行接口(CLI)。部署能力现覆盖高性能推理、服务化部署及端侧部署三大场景。 + +- **适配飞桨框架 3.0,优化训练流程:** 新版本已兼容飞桨 3.0 的 CINN 编译器等最新特性,静态图模型存储文件名由 `xxx.pdmodel` 改为 `xxx.json`。 + +- **统一模型名称:** 对PaddleOCR3.0支持的模型命名体系进行了更新,采用更规范、统一的命名规则,为后续迭代与维护奠定基础。 + +- [2.x升级3.x其他说明](./upgrade_notes.md)。 + #### **🔥🔥2025.3.7 PaddleOCR 2.10 版本,主要包含如下内容** - **重磅新增 OCR 领域 12 个自研单模型:** diff --git a/docs/update/upgrade_notes.en.md b/docs/update/upgrade_notes.en.md index d6d2a1f11d5884bb128f07b4b13a5a3386a2ce51..6e746f0d518d18e3d5616dbd509a02cd6183576f 100644 --- a/docs/update/upgrade_notes.en.md +++ b/docs/update/upgrade_notes.en.md @@ -17,10 +17,10 @@ Given this background, we’ve decided to implement a major, non-backward-compat The 3.x upgrade consists of three major enhancements: 1. **New Model Pipelines**: Introduced several new pipelines such as PP-OCRv5, PP-StructureV3, and PP-ChatOCR v4, covering a wide range of base models. These significantly enhance recognition capabilities for various text types, including handwriting, to meet the growing demand for high-precision parsing in complex documents. All models are ready-to-use out of the box, improving development efficiency. -2. **Refactored Deployment and Unified Inference Interface**: The deployment module in PaddleOCR 3.x is rebuilt using PaddleX’s underlying capabilities, fixing design flaws from 2.x and unifying both Python APIs and CLI interfaces. The deployment now supports three main scenarios: high-performance inference, service-oriented deployment, and edge deployment. +2. **Refactored Deployment and Unified Inference Interface**: The deployment module in PaddleOCR 3.x is rebuilt using [PaddleX](../version3.x/paddleocr_and_paddlex.en.md)’s underlying capabilities, fixing design flaws from 2.x and unifying both Python APIs and CLI interfaces. The deployment now supports three main scenarios: high-performance inference, service-oriented deployment, and edge deployment. 3. 
**PaddlePaddle 3.0 Compatibility and Optimized Training**: The new version is fully compatible with PaddlePaddle 3.0, including features like the CINN compiler. It also introduces a standardized model naming system to streamline future updates and maintenance. -Some legacy features from PaddleOCR 2.x remain partially supported in 3.x. For more information, refer to [Legacy Features](version2.x/legacy/index.en.md). +Some legacy features from PaddleOCR 2.x remain partially supported in 3.x. For more information, refer to [Legacy Features](../version2.x/legacy/index.en.md). ## 3. Migrating Inference Code from PaddleOCR 2.x to 3.x @@ -68,7 +68,7 @@ res.save_to_img("result") It’s worth noting that the `PPStructure` module in PaddleOCR 2.x has been removed in 3.x. We recommend switching to `PPStructureV3`, which offers richer functionality and better parsing results. Refer to the relevant documentation for usage details. -Also, in 2.x, the `show_log` parameter could be passed when creating a `PaddleOCR` object to control logging. However, this design affected all `PaddleOCR` instances due to the use of a shared logger—clearly not the expected behavior. PaddleOCR 3.x introduces a brand-new logging system to address this issue. For more details, see [Logging](version3.x/logging.en.md). +Also, in 2.x, the `show_log` parameter could be passed when creating a `PaddleOCR` object to control logging. However, this design affected all `PaddleOCR` instances due to the use of a shared logger—clearly not the expected behavior. PaddleOCR 3.x introduces a brand-new logging system to address this issue. For more details, see [Logging](../version3.x/logging.en.md). ## 4. Known Issues in PaddleOCR 3.0 diff --git a/docs/update/upgrade_notes.md b/docs/update/upgrade_notes.md index 840134d67d24de00b492481355ce9f5a62971766..b0abee9d5a7ace66e218ca78f7873867d500d1b3 100644 --- a/docs/update/upgrade_notes.md +++ b/docs/update/upgrade_notes.md @@ -17,10 +17,10 @@ 本次升级内容主要可分为三个部分: 1. **新增多条模型产线**:推出 PP-OCRv5、PP-StructureV3、PP-ChatOCR v4 等多条模型产线,并补充覆盖多种方向的基础模型,重点增强了多文字类型识别、手写体识别等能力,满足大模型应用对复杂文档高精度解析的旺盛需求。用户可直接开箱使用,提升开发效率。 -2. **重构部署能力,统一推理接口**:PaddleOCR 3.x 融合了飞桨 [PaddleX](version3.x/paddleocr_and_paddlex.md) 工具的底层能力,全面升级推理、部署模块,修正 2.x 版本中的设计错误,统一并优化了 Python API 和命令行接口(CLI)。部署能力现覆盖高性能推理、服务化部署及端侧部署三大场景。 +2. **重构部署能力,统一推理接口**:PaddleOCR 3.x 融合了飞桨 [PaddleX](../version3.x/paddleocr_and_paddlex.md) 工具的底层能力,全面升级推理、部署模块,修正 2.x 版本中的设计错误,统一并优化了 Python API 和命令行接口(CLI)。部署能力现覆盖高性能推理、服务化部署及端侧部署三大场景。 3. **适配飞桨 3.0,优化训练流程**:新版本已兼容飞桨 3.0 的 CINN 编译器等最新特性,并对模型命名体系进行了更新,采用更规范、统一的命名规则,为后续迭代与维护奠定基础。 -对于 PaddleOCR 2.x 中的部分历史遗留功能,PaddleOCR 3.x 目前仍提供了一定程度的兼容支持。详情请参阅 [历史遗留功能](version2.x/legacy/index.md)。 +对于 PaddleOCR 2.x 中的部分历史遗留功能,PaddleOCR 3.x 目前仍提供了一定程度的兼容支持。详情请参阅 [历史遗留功能](../version2.x/legacy/index.md)。 ## 3. 
将 PaddleOCR 2.x 的推理代码移到 PaddleOCR 3.x @@ -56,7 +56,7 @@ from paddleocr import PaddleOCR # 基础的初始化参数保持一致 ocr = PaddleOCR(lang="en") result = ocr.ocr("img.png") -# 也可以使用新的统接口 +# 也可以使用新的统一接口 # result = ocr.predict("img.png") for res in result: # 可直接调用方法打印识别结果,无需嵌套循环 @@ -68,7 +68,7 @@ res.save_to_img("result") 需要特别指出的是,PaddleOCR 2.x 提供的 `PPStructure` 在 PaddleOCR 3.x 中已被移除。建议使用功能更丰富、解析效果更好的 `PPStructureV3` 替代,并参考相关文档了解新接口的用法。 -此外,在 PaddleOCR 2.x 中,可以通过在构造 `PaddleOCR` 对象时传入 `show_log` 参数来控制日志输出。然而,这种设计存在局限:由于所有 `PaddleOCR` 实例共享一个日志器,当一个实例设置了日志行为后,其它实例也会受到影响,这显然不符合预期。为了解决这一问题,PaddleOCR 3.x 引入了全新的日志系统。详细内容请参阅 [日志](version3.x/logging.md)。 +此外,在 PaddleOCR 2.x 中,可以通过在构造 `PaddleOCR` 对象时传入 `show_log` 参数来控制日志输出。然而,这种设计存在局限:由于所有 `PaddleOCR` 实例共享一个日志器,当一个实例设置了日志行为后,其它实例也会受到影响,这显然不符合预期。为了解决这一问题,PaddleOCR 3.x 引入了全新的日志系统。详细内容请参阅 [日志](../version3.x/logging.md)。 ## 4. PaddleOCR 3.0 已知问题 diff --git a/docs/version2.x/README.en.md b/docs/version2.x/README.en.md new file mode 100644 index 0000000000000000000000000000000000000000..3c062c7e5191fb6188f1748b87a09a8e78d4762a --- /dev/null +++ b/docs/version2.x/README.en.md @@ -0,0 +1,3 @@ +# Features of the 2.x Branch + +Due to the upgrade of the 3.x branch, the wheel package has undergone refactoring, resulting in some models and features no longer being compatible with the older branch. To ensure that users relying on the 2.x branch's features can continue using them, we have retained the code related to the 2.x branch in this directory. diff --git a/docs/version2.x/README.md b/docs/version2.x/README.md new file mode 100644 index 0000000000000000000000000000000000000000..32d261a3ba616a9308ef0f12d15d2d2304fd6b8b --- /dev/null +++ b/docs/version2.x/README.md @@ -0,0 +1,3 @@ +# 关于 2.x 分支的功能 + +由于 3.x 分支的升级,wheel 包经历了重构,导致部分模型和功能与旧版分支不再兼容。为了使得依赖于 2.x 分支功能的用户能够继续使用这些功能,我们在此目录中保留了与 2.x 分支相关的代码。 diff --git a/docs/version2.x/legacy/model_list_2.x.en.md b/docs/version2.x/legacy/model_list_2.x.en.md new file mode 100644 index 0000000000000000000000000000000000000000..7c455879b4fffe1359982524137fe5cedfbb3bf4 --- /dev/null +++ b/docs/version2.x/legacy/model_list_2.x.en.md @@ -0,0 +1,106 @@ +# Models Supported by PaddleOCR 2.x and Earlier Versions + +Due to differences in the concatenation logic and configurations used during model training and inference, the PP-OCRv4 and PP-OCRv3 series models from the PaddleOCR 2.x branch cannot be used interchangeably with those from the PaddleOCR 3.0 and later branches. 
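+
+In practice this means a 3.x install should be pointed at 3.x model names, so that it downloads weights in the new format, rather than being fed 2.x inference files from the list below. A hedged sketch of selecting an older model generation from 3.x (the parameter names follow the 3.x quick start; the exact set of supported names may vary by release):
+
+```python
+from paddleocr import PaddleOCR
+
+# PaddleOCR 3.x: request PP-OCRv4-series models by name; the downloaded
+# weights are in the 3.x format and are not interchangeable with 2.x files.
+ocr = PaddleOCR(
+    text_detection_model_name="PP-OCRv4_mobile_det",
+    text_recognition_model_name="PP-OCRv4_mobile_rec",
+)
+```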
+ +## Detection Models + +### Chinese Detection Models + +* ch_PP-OCRv4_det +* ch_PP-OCRv4_server_det +* ch_PP-OCRv3_det_slim +* ch_PP-OCRv3_det +* ch_PP-OCRv2_det_slim +* ch_PP-OCRv2_det +* ch_ppocr_mobile_slim_v2.0_det +* ch_ppocr_mobile_v2.0_det +* ch_ppocr_server_v2.0_det + +### English Detection Models + +* en_PP-OCRv3_det_slim +* en_PP-OCRv3_det + +### Multilingual Detection Models + +* ml_PP-OCRv3_det_slim +* ml_PP-OCRv3_det + +## Recognition Models + +### Chinese Recognition Models + +* ch_PP-OCRv4_rec +* ch_PP-OCRv4_server_rec +* ch_PP-OCRv4_server_rec_doc +* ch_PP-OCRv3_rec_slim +* ch_PP-OCRv3_rec +* ch_PP-OCRv2_rec_slim +* ch_PP-OCRv2_rec +* ch_ppocr_mobile_slim_v2.0_rec +* ch_ppocr_mobile_v2.0_rec +* ch_ppocr_server_v2.0_rec +* SVTRv2(Rec Sever) +* RepSVTR(Mobile) + +### English Recognition Models + +* en_PP-OCRv4_rec +* en_PP-OCRv3_rec_slim +* en_PP-OCRv3_rec +* en_number_mobile_slim_v2.0_rec +* en_number_mobile_v2.0_rec + +### Multilingual Recognition Models + +* korean_PP-OCRv3_rec +* japan_PP-OCRv3_rec +* chinese_cht_PP-OCRv3_rec +* te_PP-OCRv3_rec +* ka_PP-OCRv3_rec +* ta_PP-OCRv3_rec +* latin_PP-OCRv3_rec +* arabic_PP-OCRv3_rec +* cyrillic_PP-OCRv3_rec +* devanagari_PP-OCRv3_rec + +## End-to-End OCR Models + +* PGNet + +## Text Direction Classification Models + +* ch_ppocr_mobile_slim_v2.0_cls +* ch_ppocr_mobile_v2.0_cls + +## Formula Recognition Models + +* CAN +* UniMERNet +* LaTeX-OCR +* PP-FormulaNet-S +* PP-FormulaNet-L + +## Table Recognition Models + +* TableMaster +* SLANet +* SLANeXt_wired +* SLANeXt_wireless +* en_ppocr_mobile_v2.0_table_structure +* en_ppstructure_mobile_v2.0_SLANet +* ch_ppstructure_mobile_v2.0_SLANet + +## Table OCR Models + +* en_ppocr_mobile_v2.0_table_det +* en_ppocr_mobile_v2.0_table_rec + +## Layout Detection Models + +* picodet_lcnet_x1_0_fgd_layout +* ppyolov2_r50vd_dcn_365e_publaynet +* picodet_lcnet_x1_0_fgd_layout_cdla +* picodet_lcnet_x1_0_fgd_layout_table +* ppyolov2_r50vd_dcn_365e_tableBank_word +* ppyolov2_r50vd_dcn_365e_tableBank_latex diff --git a/docs/version2.x/legacy/model_list_2.x.md b/docs/version2.x/legacy/model_list_2.x.md index 090be33c0272b209f15bedebdccab2f6094b2726..0324f102d4cd2a560101b37b780c879f1504a740 100644 --- a/docs/version2.x/legacy/model_list_2.x.md +++ b/docs/version2.x/legacy/model_list_2.x.md @@ -1,6 +1,6 @@ # PaddleOCR 2.x及更低版本支持的模型 -由于串联逻辑和模型训练、推理时用到的配置不同,PaddleOCR 2.x 版本的 PP-OCRv4、PP-OCRv3 系列模型与 PaddleOCR 3.0 及以上版本的 PP-OCRv4、PP-OCRv3 系列模型不能互相使用。 +由于串联逻辑和模型训练、推理时用到的配置不同,PaddleOCR 2.x 分支的 PP-OCRv4、PP-OCRv3 系列模型与 PaddleOCR 3.0 及以上分支的 PP-OCRv4、PP-OCRv3 系列模型不能互相使用。 ## 检测模型 diff --git a/docs/version2.x/ppocr/blog/multi_languages.en.md b/docs/version2.x/ppocr/blog/multi_languages.en.md index 39cb4a2b8ab3a3d184c5f4f842cfdab6b9e98519..23c3d2afa01e6f7ff2d2dbeed0ba16c08d2b32dc 100644 --- a/docs/version2.x/ppocr/blog/multi_languages.en.md +++ b/docs/version2.x/ppocr/blog/multi_languages.en.md @@ -38,16 +38,16 @@ This document will briefly introduce how to use the multilingual model. 
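+As a preview of where the steps below lead, a typical multilingual call looks like this (a minimal sketch in the 2.x-style API this page targets; `korean` is one example language code, and the image name is hypothetical):
+
+```python linenums="1"
+from paddleocr import PaddleOCR  # 2.x wheel, as pinned below
+
+ocr = PaddleOCR(lang="korean")  # choose the language when constructing the engine
+result = ocr.ocr("korean_sign.jpg")  # hypothetical input image
+for box, (text, score) in result[0]:  # result[0] holds the first page/image
+    print(text, score)
+```
+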
```bash linenums="1" # cpu -pip install paddlepaddle +pip install "paddlepaddle<=2.6" # gpu -pip install paddlepaddle-gpu +pip install "paddlepaddle-gpu<=2.6" ``` ### 1.2 PaddleOCR package installation ```bash linenums="1" -pip install paddleocr +pip install "paddleocr<3.0" ``` Build and install locally diff --git a/docs/version2.x/ppocr/blog/multi_languages.md b/docs/version2.x/ppocr/blog/multi_languages.md index c0852eea6af1c1184f9dc0190a57b354fadf9339..c611d395a7aaa87bade983bc19e019ff9cdfbb97 100644 --- a/docs/version2.x/ppocr/blog/multi_languages.md +++ b/docs/version2.x/ppocr/blog/multi_languages.md @@ -36,10 +36,10 @@ PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,不仅 ```bash linenums="1" # cpu -pip install paddlepaddle +pip install "paddlepaddle<=2.6" # gpu -pip install paddlepaddle-gpu +pip install "paddlepaddle-gpu<=2.6" ``` ### 1.2 paddleocr package 安装 @@ -47,7 +47,7 @@ pip install paddlepaddle-gpu pip 安装 ```bash linenums="1" -pip install paddleocr +pip install "paddleocr<3.0" ``` 本地构建并安装 diff --git a/docs/version2.x/ppocr/blog/whl.md b/docs/version2.x/ppocr/blog/whl.md index 722c0462ed021fd83588b739f4e819d8477ce022..f3f0536872a3b76b0273157fa1f6b0d2aa67b762 100644 --- a/docs/version2.x/ppocr/blog/whl.md +++ b/docs/version2.x/ppocr/blog/whl.md @@ -12,7 +12,7 @@ comments: true pip安装 ```bash linenums="1" -pip install paddleocr +pip install "paddleocr<3.0" ``` 本地构建并安装 diff --git a/docs/version2.x/ppocr/quick_start.en.md b/docs/version2.x/ppocr/quick_start.en.md index 0bdc00d92e79771f40f3fca8340118ccb2242ef1..8ef1b147c78d27bb6ceefddbf9690dc4d7ee13d4 100644 --- a/docs/version2.x/ppocr/quick_start.en.md +++ b/docs/version2.x/ppocr/quick_start.en.md @@ -15,13 +15,13 @@ comments: true - If you have CUDA 11 installed on your machine, please run the following command to install ```bash linenums="1" - pip install paddlepaddle-gpu + pip install "paddlepaddle-gpu<=2.6" ``` - If you have no available GPU on your machine, please run the following command to install the CPU version ```bash linenums="1" - python -m pip install paddlepaddle + python -m pip install "paddlepaddle<=2.6" ``` For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/en/install/quick) for operation. 
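+
+After installing, a quick way to confirm that pip actually resolved the pinned 2.x-era framework (an optional sanity check, not part of the original guide):
+
+```bash linenums="1"
+python -c "import paddle; print(paddle.__version__)"  # expect 2.6.x or lower
+python -c "import paddle; paddle.utils.run_check()"   # optional: verify the install works
+```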
diff --git a/docs/version2.x/ppocr/quick_start.md b/docs/version2.x/ppocr/quick_start.md
index d89b539bb766bb69dc4528ca8c7c3fa742fadc2f..d98bbcbaebc73c64bc464ceda3f77dc60da90a72 100644
--- a/docs/version2.x/ppocr/quick_start.md
+++ b/docs/version2.x/ppocr/quick_start.md
@@ -17,13 +17,13 @@ comments: true
 - 您的机器安装的是CUDA 11,请运行以下命令安装
 
   ```bash linenums="1"
-  pip install paddlepaddle-gpu
+  pip install "paddlepaddle-gpu<=2.6"
   ```
 
 - 您的机器是CPU,请运行以下命令安装
 
   ```bash linenums="1"
-  pip install paddlepaddle
+  pip install "paddlepaddle<=2.6"
   ```
 
 更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
@@ -31,7 +31,7 @@ comments: true
 ### 1.2 安装PaddleOCR whl包
 
 ```bash linenums="1"
-pip install paddleocr
+pip install "paddleocr<3.0"
 ```
 
 - 对于Windows环境用户:直接通过pip安装的shapely库可能出现`[WinError 126] 找不到指定的模块`的问题。建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装。
diff --git a/docs/version2.x/ppstructure/model_train/train_kie.en.md b/docs/version2.x/ppstructure/model_train/train_kie.en.md
index 3e64d70a0e71997a7f0010f8d59c35331b91e596..97cb6b9234ae6a56305b2542e5debf0359e272e3 100644
--- a/docs/version2.x/ppstructure/model_train/train_kie.en.md
+++ b/docs/version2.x/ppstructure/model_train/train_kie.en.md
@@ -89,7 +89,7 @@ cd PaddleOCR
 pip install -r requirements.txt
 pip install -r ppstructure/kie/requirements.txt
 # 安装PaddleOCR引擎用于预测
-pip install paddleocr -U
+pip install "paddleocr<3.0"
 ```
 
 NOTE: For KIE tasks, it is necessary to downgrade the Paddle framework version (Paddle<2.6) and the PaddleNLP version (PaddleNLP<2.6).
diff --git a/docs/version2.x/ppstructure/model_train/train_kie.md b/docs/version2.x/ppstructure/model_train/train_kie.md
index 4ac026437e3bd9736b618c5ee52a9bbba55d1b9d..affd7d4129d19311527891396904b1282b4ca35d 100644
--- a/docs/version2.x/ppstructure/model_train/train_kie.md
+++ b/docs/version2.x/ppstructure/model_train/train_kie.md
@@ -88,7 +88,7 @@ cd PaddleOCR
 pip install -r requirements.txt
 pip install -r ppstructure/kie/requirements.txt
 # 安装PaddleOCR引擎用于预测
-pip install paddleocr -U
+pip install "paddleocr<3.0"
 ```
 
 NOTE: 对于KIE任务,需要降低Paddle框架版本(Paddle<2.6)和PaddleNLP版本(PaddleNLP<2.6)。
diff --git a/docs/version2.x/ppstructure/quick_start.en.md b/docs/version2.x/ppstructure/quick_start.en.md
index d21ed9c90b8103e5890db0b20a6dec237b680cf2..b80c75417ab653c6ffd8836962f505a7d8a56f9e 100644
--- a/docs/version2.x/ppstructure/quick_start.en.md
+++ b/docs/version2.x/ppstructure/quick_start.en.md
@@ -13,19 +13,19 @@ comments: true
 - PaddlePaddle with CUDA 11.8
 
   ```bash linenums="1"
-  python3 -m pip install paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+  python3 -m pip install "paddlepaddle-gpu<=2.6" -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
   ```
 
 - PaddlePaddle with CUDA 12.3
 
   ```bash linenums="1"
-  python3 -m pip install paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
+  python3 -m pip install "paddlepaddle-gpu<=2.6" -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
   ```
 
 - If your machine does not have an available GPU, please run the following command to install the CPU version
 
   ```bash linenums="1"
-  python3 -m pip install paddlepaddle -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+  python3 -m pip install "paddlepaddle<=2.6" -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
   ```
 
 For more software version requirements, please refer to the instructions in the [Installation Document](https://www.paddlepaddle.org.cn/en/install/quick).
@@ -33,7 +33,7 @@ For more software version requirements, please refer to the instructions in the ### 1.2 Install PaddleOCR Whl Package ```bash linenums="1" -python3 -m pip install paddleocr +python3 -m pip install "paddleocr<3.0" # Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it) python3 -m pip install paddleclas diff --git a/docs/version2.x/ppstructure/quick_start.md b/docs/version2.x/ppstructure/quick_start.md index ae1507f768664de61c37ab4dd12a77007569de1b..99fb5ec3baa30907331e5fef77e66bb6e20867be 100644 --- a/docs/version2.x/ppstructure/quick_start.md +++ b/docs/version2.x/ppstructure/quick_start.md @@ -13,19 +13,19 @@ comments: true - CUDA11.8 的 PaddlePaddle ```bash linenums="1" - python3 -m pip install paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ + python3 -m pip install "paddlepaddle-gpu<=2.6" -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ ``` - CUDA12.3 的 PaddlePaddle ```bash linenums="1" - python3 -m pip install paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ + python3 -m pip install "paddlepaddle-gpu<=2.6" -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ ``` - 您的机器是CPU,请运行以下命令安装 ```bash linenums="1" - python3 -m pip install paddlepaddle -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ + python3 -m pip install "paddlepaddle<=2.6" -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ ``` 更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 @@ -33,7 +33,7 @@ comments: true ### 1.2 安装PaddleOCR whl包 ```bash linenums="1" -python3 -m pip install paddleocr +python3 -m pip install "paddleocr<3.0" # 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过) python3 -m pip install paddleclas diff --git a/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md b/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md index 3ebb49eace96de157c4b8ecd4b0f9e9e7bd42d9e..f964426c049ece123553834e9c99103594937443 100644 --- a/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md +++ b/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md @@ -199,11 +199,139 @@ A single model can cover multiple languages and text types, with recognition acc
-More Demos
+More Demos
+
+## Reference Data for Inference Performance
+
+Test Environment:
+
+- NVIDIA Tesla V100
+- Intel Xeon Gold 6271C
+- PaddlePaddle 3.0.0
+
+Tested on 200 images (including both general and document images). During testing, images are read from disk, so the image reading time and other associated overhead are also included in the total time consumption. If the images are preloaded into memory, the average time per image can be further reduced by approximately 25 ms.
+
+Unless otherwise specified:
+
+- PP-OCRv5_mobile_det and PP-OCRv5_mobile_rec models are used.
+- Document orientation classification, image correction, and text line orientation classification are not used.
+- `text_det_limit_type` is set to `"min"` and `text_det_limit_side_len` to `736`.
+
+### 1. Comparison of Inference Performance Between PP-OCRv5 and PP-OCRv4
+
+| Config | Description |
+| --------------- | ------------------------------------------------------------ |
+| v5_mobile | Uses PP-OCRv5_mobile_det and PP-OCRv5_mobile_rec models. |
+| v4_mobile | Uses PP-OCRv4_mobile_det and PP-OCRv4_mobile_rec models. |
+| v5_server | Uses PP-OCRv5_server_det and PP-OCRv5_server_rec models. |
+| v4_server | Uses PP-OCRv4_server_det and PP-OCRv4_server_rec models. |
+
+**GPU, without high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) | Avg GPU Usage (%) | Peak VRAM (MB) | Avg VRAM (MB) |
+| ---------- | ------------------ | ------------- | ----------------- | ------------- | ------------ | ----------------- | -------------- | ------------- |
+| v5_mobile | 0.56 | 1162 | 106.02 | 1576.43 | 1420.83 | 18.95 | 4342.00 | 3258.95 |
+| v4_mobile | 0.27 | 2246 | 111.20 | 1392.22 | 1318.76 | 28.90 | 1304.00 | 1166.46 |
+| v5_server | 0.70 | 929 | 105.31 | 1634.85 | 1428.55 | 36.21 | 5402.00 | 4685.13 |
+| v4_server | 0.44 | 1418 | 106.96 | 1455.34 | 1346.95 | 58.82 | 6760.00 | 5817.46 |
+
+**GPU, with high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) | Avg GPU Usage (%) | Peak VRAM (MB) | Avg VRAM (MB) |
+| ---------- | ------------------ | ------------- | ----------------- | ------------- | ------------ | ----------------- | -------------- | ------------- |
+| v5_mobile | 0.50 | 1301 | 106.50 | 1338.12 | 1155.86 | 11.97 | 4112.00 | 3536.36 |
+| v4_mobile | 0.21 | 2887 | 114.09 | 1113.27 | 1054.46 | 15.22 | 2072.00 | 1840.59 |
+| v5_server | 0.60 | 1084 | 105.73 | 1980.73 | 1776.20 | 22.10 | 12150.00 | 11849.40 |
+| v4_server | 0.36 | 1687 | 104.15 | 1186.42 | 1065.67 | 38.12 | 13058.00 | 12679.00 |
+
+**CPU, without high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) |
+| ---------- | ------------------ | ------------- | ----------------- | ------------- | ------------ |
+| v5_mobile | 1.43 | 455 | 798.93 | 11695.40 | 6829.09 |
+| v4_mobile | 1.09 | 556 | 813.16 | 11996.30 | 6834.25 |
+| v5_server | 3.79 | 172 | 799.24 | 50216.00 | 27902.40 |
+| v4_server | 4.22 | 148 | 803.74 | 51428.70 | 28593.60 |
+
+**CPU, with high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) |
+| ---------- | ------------------ | ------------- | ----------------- | ------------- | ------------ |
+| v5_mobile | 1.14 | 571 | 339.68 | 3245.17 | 2560.55 |
+| v4_mobile | 0.68 | 892 | 443.00 | 3057.38 | 2329.44 |
+| v5_server | 3.56 | 183 | 797.03 | 45664.70 | 26905.90 |
+| v4_server | 4.22 | 148 | 803.74 | 51428.70 | 28593.60 |
+
+> Note: PP-OCRv5 uses a larger dictionary in the recognition model, which increases inference time and causes slower performance compared to PP-OCRv4.
+
+### 2. Impact of Auxiliary Features on PP-OCRv5 Inference Performance
+
+| Config | Description |
+| --------------- | --------------------------------------------------------------------------------------------------------- |
+| base | No document orientation classification, no image correction, no text line orientation classification. |
+| with_textline | Includes text line orientation classification only. |
+| with_all | Includes document orientation classification, image correction, and text line orientation classification. |
+
+**GPU, without high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) | Avg GPU Usage (%) | Peak VRAM (MB) | Avg VRAM (MB) |
+| -------------- | ------------------ | ------------- | ----------------- | ------------- | ------------ | ----------------- | -------------- | ------------- |
+| base | 0.56 | 1162 | 106.02 | 1576.43 | 1420.83 | 18.95 | 4342.00 | 3258.95 |
+| with_textline | 0.59 | 1104 | 105.58 | 1765.64 | 1478.53 | 19.48 | 4350.00 | 3267.77 |
+| with_all | 1.02 | 600 | 104.92 | 1924.23 | 1628.50 | 10.96 | 2632.00 | 2217.01 |
+
+**CPU, without high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) |
+| -------------- | ------------------ | ------------- | ----------------- | ------------- | ------------ |
+| base | 1.43 | 455 | 798.93 | 11695.40 | 6829.09 |
+| with_textline | 1.50 | 434 | 799.47 | 12007.20 | 6882.22 |
+| with_all | 1.93 | 316 | 646.49 | 11759.60 | 6940.54 |
+
+> Note: Auxiliary features such as image unwarping can impact inference accuracy. More features do not necessarily yield better results and may increase resource usage.
+
+### 3. Impact of Input Scaling Strategy in Text Detection Module on PP-OCRv5 Inference Performance
+
+| Config | Description |
+| ----------------- | -------------------------------------------------------------------------------------- |
+| mobile_min_1280 | Uses `min` limit type and `text_det_limit_side_len=1280` with PP-OCRv5_mobile models. |
+| mobile_min_736 | Same as default, `min`, `side_len=736`. |
+| mobile_max_960 | Uses `max` limit type and `side_len=960`. |
+| mobile_max_640 | Uses `max` limit type and `side_len=640`. |
+| server_min_1280 | Uses `min`, `side_len=1280` with PP-OCRv5_server models. |
+| server_min_736 | Same as default, `min`, `side_len=736`. |
+| server_max_960 | Uses `max`, `side_len=960`. |
+| server_max_640 | Uses `max`, `side_len=640`. |
+
+**GPU, without high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) | Avg GPU Usage (%) | Peak VRAM (MB) | Avg VRAM (MB) |
+| ----------------- | ------------------ | ------------- | ----------------- | ------------- | ------------ | ----------------- | -------------- | ------------- |
+| mobile_min_1280 | 0.61 | 1071 | 109.12 | 1663.71 | 1439.72 | 19.27 | 4202.00 | 3550.32 |
+| mobile_min_736 | 0.56 | 1162 | 106.02 | 1576.43 | 1420.83 | 18.95 | 4342.00 | 3258.95 |
+| mobile_max_960 | 0.48 | 1313 | 103.49 | 1587.25 | 1395.48 | 19.37 | 2642.00 | 2319.03 |
+| mobile_max_640 | 0.42 | 1436 | 103.07 | 1651.14 | 1422.62 | 18.95 | 2530.00 | 2149.11 |
+| server_min_1280 | 0.82 | 795 | 107.17 | 1678.16 | 1428.94 | 40.43 | 10368.00 | 8320.43 |
+| server_min_736 | 0.70 | 929 | 105.31 | 1634.85 | 1428.55 | 36.21 | 5402.00 | 4685.13 |
+| server_max_960 | 0.59 | 1073 | 103.03 | 1590.19 | 1383.62 | 33.42 | 2928.00 | 2079.47 |
+| server_max_640 | 0.54 | 1099 | 102.63 | 1602.09 | 1416.49 | 30.77 | 3152.00 | 2737.81 |
+
+**CPU, without high-performance inference:**
+
+| Config | Avg Time/Image (s) | Avg Chars/sec | Avg CPU Usage (%) | Peak RAM (MB) | Avg RAM (MB) |
+| ----------------- | ------------------ | ------------- | ----------------- | ------------- | ------------ |
+| mobile_min_1280 | 1.64 | 398 | 799.45 | 12344.10 | 7100.60 |
+| mobile_min_736 | 1.43 | 455 | 798.93 | 11695.40 | 6829.09 |
+| mobile_max_960 | 1.21 | 521 | 800.13 | 11099.10 | 6369.49 |
+| mobile_max_640 | 1.01 | 597 | 802.52 | 9585.48 | 5573.52 |
+| server_min_1280 | 4.48 | 145 | 800.49 | 50683.10 | 28273.30 |
+| server_min_736 | 3.79 | 172 | 799.24 | 50216.00 | 27902.40 |
+| server_max_960 | 2.67 | 237 | 797.63 | 49362.50 | 26075.60 |
+| server_max_640 | 2.36 | 251 | 795.18 | 45656.10 | 24900.80 |
 
 # Deployment and Secondary Development
 
 * **Multiple System Support**: Compatible with mainstream operating systems including Windows, Linux, and Mac.
 * **Multiple Hardware Support**: Besides NVIDIA GPUs, it also supports inference and deployment on Intel CPU, Kunlun chips, Ascend, and other new hardware.
 * **High-Performance Inference Plugin**: Recommended to combine with high-performance inference plugins to further improve inference speed. See [High-Performance Inference Guide](../../deployment/high_performance_inference.md) for details.
 * **Service Deployment**: Supports highly stable service deployment solutions. See [Service Deployment Guide](../../deployment/serving.md) for details.
-* **Secondary Development Capability**: Supports custom dataset training, dictionary extension, and model fine-tuning. Example: To add Korean recognition, you can extend the dictionary and fine-tune the model, seamlessly integrating into existing production lines. See [Text Recognition Module Usage Tutorial](../../module_usage/text_recognition.md) for details.
+* **Secondary Development Capability**: Supports custom dataset training, dictionary extension, and model fine-tuning. Example: To add Korean recognition, you can extend the dictionary and fine-tune the model, seamlessly integrating into existing pipelines. See [Text Detection Module Usage Tutorial](../../module_usage/text_detection.en.md) and [Text Recognition Module Usage Tutorial](../../module_usage/text_recognition.en.md) for details.
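+
+As a concrete reference for reproducing the rows above, the following sketch wires the "mobile_max_960" scaling strategy and the "base" auxiliary-feature setting into the Python API (an illustration built from the pipeline parameters referenced in this document; "img.png" is a placeholder input, and this configuration is not a tuned recommendation):
+
+```python
+from paddleocr import PaddleOCR
+
+# "mobile_max_960" scaling strategy with the "base" auxiliary-feature setting
+ocr = PaddleOCR(
+    text_detection_model_name="PP-OCRv5_mobile_det",
+    text_recognition_model_name="PP-OCRv5_mobile_rec",
+    text_det_limit_type="max",
+    text_det_limit_side_len=960,
+    use_doc_orientation_classify=False,  # no document orientation classification
+    use_doc_unwarping=False,             # no image correction
+    use_textline_orientation=False,      # no text line orientation classification
+)
+result = ocr.predict(input="img.png")
+for res in result:
+    res.print()
+```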
diff --git a/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md b/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md index 2ac7b5020cd3a1059fbd96f4519217087b0387a1..ea3151ee75677aa7215d37ef82108569f12d0200 100644 --- a/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md +++ b/docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md @@ -201,11 +201,140 @@
-更多示例
+更多示例
 
-# 四、部署与二次开发
+## 四、推理性能参考数据
+
+测试环境:
+
+- NVIDIA Tesla V100
+- Intel Xeon Gold 6271C
+- PaddlePaddle 3.0.0
+
+在 200 张图像(包括通用图像与文档图像)上测试。测试时从磁盘读取图像,因此读图时间及其他额外开销也被包含在总耗时内。如果将图像提前载入到内存,可进一步减少平均每图约 25 ms 的时间开销。
+
+如果不特别说明,则:
+
+- 使用 PP-OCRv5_mobile_det 和 PP-OCRv5_mobile_rec 模型。
+- 不使用文档图像方向分类、文本图像矫正、文本行方向分类。
+- 将 `text_det_limit_type` 设置为 `"min"`、`text_det_limit_side_len` 设置为 `736`。
+
+### 1. PP-OCRv5 与 PP-OCRv4 推理性能对比
+
+| 配置 | 说明 |
+| --- | --- |
+| v5_mobile | 使用 PP-OCRv5_mobile_det 和 PP-OCRv5_mobile_rec 模型。 |
+| v4_mobile | 使用 PP-OCRv4_mobile_det 和 PP-OCRv4_mobile_rec 模型。 |
+| v5_server | 使用 PP-OCRv5_server_det 和 PP-OCRv5_server_rec 模型。 |
+| v4_server | 使用 PP-OCRv4_server_det 和 PP-OCRv4_server_rec 模型。 |
+
+**GPU,不使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) | 平均 GPU 利用率(%) | 峰值 VRAM 用量(MB) | 平均 VRAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| v5_mobile | 0.56 | 1162 | 106.02 | 1576.43 | 1420.83 | 18.95 | 4342.00 | 3258.95 |
+| v4_mobile | 0.27 | 2246 | 111.20 | 1392.22 | 1318.76 | 28.90 | 1304.00 | 1166.46 |
+| v5_server | 0.70 | 929 | 105.31 | 1634.85 | 1428.55 | 36.21 | 5402.00 | 4685.13 |
+| v4_server | 0.44 | 1418 | 106.96 | 1455.34 | 1346.95 | 58.82 | 6760.00 | 5817.46 |
+
+**GPU,使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) | 平均 GPU 利用率(%) | 峰值 VRAM 用量(MB) | 平均 VRAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| v5_mobile | 0.50 | 1301 | 106.50 | 1338.12 | 1155.86 | 11.97 | 4112.00 | 3536.36 |
+| v4_mobile | 0.21 | 2887 | 114.09 | 1113.27 | 1054.46 | 15.22 | 2072.00 | 1840.59 |
+| v5_server | 0.60 | 1084 | 105.73 | 1980.73 | 1776.20 | 22.10 | 12150.00 | 11849.40 |
+| v4_server | 0.36 | 1687 | 104.15 | 1186.42 | 1065.67 | 38.12 | 13058.00 | 12679.00 |
+
+**CPU,不使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- |
+| v5_mobile | 1.43 | 455 | 798.93 | 11695.40 | 6829.09 |
+| v4_mobile | 1.09 | 556 | 813.16 | 11996.30 | 6834.25 |
+| v5_server | 3.79 | 172 | 799.24 | 50216.00 | 27902.40 |
+| v4_server | 4.22 | 148 | 803.74 | 51428.70 | 28593.60 |
+
+**CPU,使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- |
+| v5_mobile | 1.14 | 571 | 339.68 | 3245.17 | 2560.55 |
+| v4_mobile | 0.68 | 892 | 443.00 | 3057.38 | 2329.44 |
+| v5_server | 3.56 | 183 | 797.03 | 45664.70 | 26905.90 |
+| v4_server | 4.22 | 148 | 803.74 | 51428.70 | 28593.60 |
+
+> 说明:PP-OCRv5 的识别模型使用了更大的字典,需要更长的推理时间,导致 PP-OCRv5 的推理速度慢于 PP-OCRv4。
+
+### 2. 使用辅助功能对 PP-OCRv5 推理性能的影响
+
+| 配置 | 说明 |
+| --- | --- |
+| base | 不使用文档图像方向分类、文本图像矫正、文本行方向分类。 |
+| with_textline | 使用文本行方向分类,不使用文档图像方向分类、文本图像矫正。 |
+| with_all | 使用文档图像方向分类、文本图像矫正、文本行方向分类。 |
+
+**GPU,不使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) | 平均 GPU 利用率(%) | 峰值 VRAM 用量(MB) | 平均 VRAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| base | 0.56 | 1162 | 106.02 | 1576.43 | 1420.83 | 18.95 | 4342.00 | 3258.95 |
+| with_textline | 0.59 | 1104 | 105.58 | 1765.64 | 1478.53 | 19.48 | 4350.00 | 3267.77 |
+| with_all | 1.02 | 600 | 104.92 | 1924.23 | 1628.50 | 10.96 | 2632.00 | 2217.01 |
+
+**CPU,不使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- |
+| base | 1.43 | 455 | 798.93 | 11695.40 | 6829.09 |
+| with_textline | 1.50 | 434 | 799.47 | 12007.20 | 6882.22 |
+| with_all | 1.93 | 316 | 646.49 | 11759.60 | 6940.54 |
+
+> 说明:文本图像矫正等辅助功能会对端到端推理精度造成影响,因此并非使用的辅助功能越多效果越好,且使用更多辅助功能可能增大资源用量。
+
+### 3. 文本检测模块输入缩放尺寸策略对 PP-OCRv5 推理性能的影响
+
+| 配置 | 说明 |
+| --- | --- |
+| mobile_min_1280 | 使用 PP-OCRv5_mobile_det 和 PP-OCRv5_mobile_rec 模型,将 `text_det_limit_type` 设置为 `"min"`、`text_det_limit_side_len` 设置为 `1280`。 |
+| mobile_min_736 | 使用 PP-OCRv5_mobile_det 和 PP-OCRv5_mobile_rec 模型,将 `text_det_limit_type` 设置为 `"min"`、`text_det_limit_side_len` 设置为 `736`(即默认配置)。 |
+| mobile_max_960 | 使用 PP-OCRv5_mobile_det 和 PP-OCRv5_mobile_rec 模型,将 `text_det_limit_type` 设置为 `"max"`、`text_det_limit_side_len` 设置为 `960`。 |
+| mobile_max_640 | 使用 PP-OCRv5_mobile_det 和 PP-OCRv5_mobile_rec 模型,将 `text_det_limit_type` 设置为 `"max"`、`text_det_limit_side_len` 设置为 `640`。 |
+| server_min_1280 | 使用 PP-OCRv5_server_det 和 PP-OCRv5_server_rec 模型,将 `text_det_limit_type` 设置为 `"min"`、`text_det_limit_side_len` 设置为 `1280`。 |
+| server_min_736 | 使用 PP-OCRv5_server_det 和 PP-OCRv5_server_rec 模型,将 `text_det_limit_type` 设置为 `"min"`、`text_det_limit_side_len` 设置为 `736`(即默认配置)。 |
+| server_max_960 | 使用 PP-OCRv5_server_det 和 PP-OCRv5_server_rec 模型,将 `text_det_limit_type` 设置为 `"max"`、`text_det_limit_side_len` 设置为 `960`。 |
+| server_max_640 | 使用 PP-OCRv5_server_det 和 PP-OCRv5_server_rec 模型,将 `text_det_limit_type` 设置为 `"max"`、`text_det_limit_side_len` 设置为 `640`。 |
+
+**GPU,不使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) | 平均 GPU 利用率(%) | 峰值 VRAM 用量(MB) | 平均 VRAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| mobile_min_1280 | 0.61 | 1071 | 109.12 | 1663.71 | 1439.72 | 19.27 | 4202.00 | 3550.32 |
+| mobile_min_736 | 0.56 | 1162 | 106.02 | 1576.43 | 1420.83 | 18.95 | 4342.00 | 3258.95 |
+| mobile_max_960 | 0.48 | 1313 | 103.49 | 1587.25 | 1395.48 | 19.37 | 2642.00 | 2319.03 |
+| mobile_max_640 | 0.42 | 1436 | 103.07 | 1651.14 | 1422.62 | 18.95 | 2530.00 | 2149.11 |
+| server_min_1280 | 0.82 | 795 | 107.17 | 1678.16 | 1428.94 | 40.43 | 10368.00 | 8320.43 |
+| server_min_736 | 0.70 | 929 | 105.31 | 1634.85 | 1428.55 | 36.21 | 5402.00 | 4685.13 |
+| server_max_960 | 0.59 | 1073 | 103.03 | 1590.19 | 1383.62 | 33.42 | 2928.00 | 2079.47 |
+| server_max_640 | 0.54 | 1099 | 102.63 | 1602.09 | 1416.49 | 30.77 | 3152.00 | 2737.81 |
+
+**CPU,不使用高性能推理:**
+
+| 配置 | 平均每图耗时(s) | 平均每秒预测字符数量 | 平均 CPU 利用率(%) | 峰值 RAM 用量(MB) | 平均 RAM 用量(MB) |
+| --- | --- | --- | --- | --- | --- |
+| mobile_min_1280 | 1.64 | 398 | 799.45 | 12344.10 | 7100.60 |
+| mobile_min_736 | 1.43 | 455 | 798.93 | 11695.40 | 6829.09 |
+| mobile_max_960 | 1.21 | 521 | 800.13 | 11099.10 | 6369.49 |
+| mobile_max_640 | 1.01 | 597 | 802.52 | 9585.48 | 5573.52 |
+| server_min_1280 | 4.48 | 145 | 800.49 | 50683.10 | 28273.30 |
+| server_min_736 | 3.79 | 172 | 799.24 | 50216.00 | 27902.40 |
+| server_max_960 | 2.67 | 237 | 797.63 | 49362.50 | 26075.60 |
+| server_max_640 | 2.36 | 251 | 795.18 | 45656.10 | 24900.80 |
+
+
+# 五、部署与二次开发
 
 * **多系统支持**:兼容Windows、Linux、Mac等主流操作系统。
 * **多硬件支持**:除了英伟达GPU外,还支持Intel CPU、昆仑芯、昇腾等新硬件推理和部署。
 * **高性能推理插件**:推荐结合高性能推理插件进一步提升推理速度,详见[高性能推理指南](../../deployment/high_performance_inference.md)。
 * **服务化部署**:支持高稳定性服务化部署方案,详见[服务化部署指南](../../deployment/serving.md)。
-* **二次开发能力**:支持自定义数据集训练、字典扩展、模型微调。举例:如需增加韩文识别,可扩展字典并微调模型,无缝集成到现有产线,详见[文本识别模块使用教程](../../module_usage/text_recognition.md)
+* **二次开发能力**:支持自定义数据集训练、字典扩展、模型微调。举例:如需增加韩文识别,可扩展字典并微调模型,无缝集成到现有产线,详见[文本检测模块使用教程](../../module_usage/text_detection.md)及[文本识别模块使用教程](../../module_usage/text_recognition.md)
diff --git a/docs/version3.x/deployment/on_device_deployment.md b/docs/version3.x/deployment/on_device_deployment.md
index 650c418de877120e443881329a483534b132afd9..63f188dd66aee2344a7b1d678332c4404c8aee27 100644
--- a/docs/version3.x/deployment/on_device_deployment.md
+++ b/docs/version3.x/deployment/on_device_deployment.md
@@ -1,3 +1,472 @@
-# 端侧部署
+# OCR 端侧部署 demo 使用指南
 
-PaddleOCR 模型可通过 [PaddleX 端侧部署方案](https://paddlepaddle.github.io/PaddleX/3.0/pipeline_deploy/edge_deploy.html) 实现端侧部署。关于 PaddleX 及其与 PaddleOCR 之间的关系,请参考 [PaddleOCR 与 PaddleX 的区别与联系](../paddleocr_and_paddlex.md#1-paddleocr-与-paddlex-的区别与联系)。
+- [快速开始](#快速开始)
+  - [环境准备](#环境准备)
+  - [物料准备](#物料准备)
+  - [部署步骤](#部署步骤)
+- [代码介绍](#代码介绍)
+- [工程详解](#工程详解)
+- [进阶使用](#进阶使用)
+  - [更新预测库](#更新预测库)
+  - [转换 nb 模型](#转换-nb-模型)
+  - [更新模型、标签文件和预测图片](#更新模型标签文件和预测图片)
+    - [更新模型](#更新模型)
+    - [更新标签文件](#更新标签文件)
+    - [更新预测图片](#更新预测图片)
+  - [更新输入/输出预处理](#更新输入输出预处理)
+
+本指南主要介绍 PaddleX 端侧部署——OCR 文字识别 demo 在 Android shell 上的运行方法。
+
+本指南适配了以下 OCR 模型:
+
+- PP-OCRv3_mobile(cpu)
+- PP-OCRv4_mobile(cpu)
+- PP-OCRv5_mobile(cpu)
+
+## 快速开始
+
+### 环境准备
+
+1. 在本地环境安装好 CMake 编译工具,并在 [Android NDK 官网](https://developer.android.google.cn/ndk/downloads)下载当前系统对应版本的 NDK 软件包。例如,在 Mac 上开发,需要在 Android NDK 官网下载 Mac 平台的 NDK 软件包。
+
+   **环境要求**
+
+   - `CMake >= 3.10`(最低版本未经验证,推荐 3.20 及以上)
+   - `Android NDK >= r17c`(最低版本未经验证,推荐 r20b 及以上)
+
+   **本指南所使用的测试环境:**
+
+   - `cmake == 3.20.0`
+   - `android-ndk == r20b`
+
+2. 准备一部 Android 手机,并开启 USB 调试模式。开启方法:`手机设置 -> 查找开发者选项 -> 打开开发者选项和 USB 调试模式`
+
+3. 电脑上安装 ADB 工具,用于调试。ADB 安装方式如下:
+
+   3.1. Mac 电脑安装 ADB:
+
+   ```shell
+   brew install --cask android-platform-tools
+   ```
+
+   3.2. Linux 安装 ADB
+
+   ```shell
+   sudo apt update
+   sudo apt install -y wget adb
+   ```
+
+   3.3. Windows 安装 ADB
+
+   Windows 上需要从谷歌的 Android 平台工具页面下载 ADB 软件包进行安装:[链接](https://developer.android.com/studio)
+
+   打开终端,手机连接电脑,在终端中输入
+
+   ```shell
+   adb devices
+   ```
+
+   如果有 device 输出,则表示安装成功。
+
+   ```shell
+   List of devices attached
+   744be294 device
+   ```
+
+### 物料准备
+
+1. 克隆 `Paddle-Lite-Demo` 仓库的 `feature/paddle-x` 分支到 `PaddleX-Lite-Deploy` 目录。
+
+   ```shell
+   git clone -b feature/paddle-x https://github.com/PaddlePaddle/Paddle-Lite-Demo.git PaddleX-Lite-Deploy
+   ```
+
+2. 填写 [问卷](https://paddle.wjx.cn/vm/eaaBo0H.aspx#) 下载压缩包,将压缩包放到指定解压目录,切换到指定解压目录后执行解压命令。
+
+   ```shell
+   # 1. 切换到指定解压目录
+   cd PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo
+
+   # 2. 执行解压命令
+   unzip ocr.zip
+   ```
+
+### 部署步骤
+
+1. 将工作目录切换到 `PaddleX-Lite-Deploy/libs` 目录,运行 `download.sh` 脚本,下载需要的 Paddle Lite 预测库。此步骤只需执行一次,即可支持每个 demo 使用。
+
+2. 将工作目录切换到 `PaddleX-Lite-Deploy/ocr/assets` 目录,运行 `download.sh` 脚本,下载 [paddle_lite_opt 工具](https://www.paddlepaddle.org.cn/lite/v2.10/user_guides/model_optimize_tool.html) 优化后的 nb 模型文件及预测图片、字典文件等物料。
+
+3. 将工作目录切换到 `PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo` 目录,运行 `build.sh` 脚本,完成可执行文件的编译。
+
+4. 将工作目录切换到 `PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo`,运行 `run.sh` 脚本,完成在端侧的预测。
+
+**注意事项:**
+
+- 在运行 `build.sh` 脚本前,需要更改 `NDK_ROOT` 指定的路径为实际安装的 NDK 路径。
+- 在 Windows 系统上可以使用 Git Bash 执行部署步骤。
+- 若在 Windows 系统上编译,需要将 `CMakeLists.txt` 中的 `CMAKE_SYSTEM_NAME` 设置为 `windows`。
+- 若在 Mac 系统上编译,需要将 `CMakeLists.txt` 中的 `CMAKE_SYSTEM_NAME` 设置为 `darwin`。
+- 在运行 `run.sh` 脚本时需保持 ADB 连接。
+- `download.sh` 和 `run.sh` 支持传入参数来指定模型,若不指定则默认使用 `PP-OCRv5_mobile` 模型。目前适配了以下模型:
+  - `PP-OCRv3_mobile`
+  - `PP-OCRv4_mobile`
+  - `PP-OCRv5_mobile`
+
+以下为实际操作时的示例:
+
+```shell
+# 1. 下载需要的 Paddle Lite 预测库
+cd PaddleX-Lite-Deploy/libs
+sh download.sh
+
+# 2. 下载 paddle_lite_opt 工具优化后的 nb 模型文件及预测图片、字典文件等物料
+cd ../ocr/assets
+sh download.sh PP-OCRv5_mobile
+
+# 3. 完成可执行文件的编译
+cd ../android/shell/ppocr_demo
+sh build.sh
+
+# 4. 完成在端侧的预测
+sh run.sh PP-OCRv5_mobile
+```
+
+运行结果如下所示:
+
+```text
+The detection visualized image saved in ./test_img_result.jpg
+0 纯臻营养护发素 0.998541
+1 产品信息/参数 0.999094
+2 (45元/每公斤,100公斤起订) 0.948841
+3 每瓶22元,1000瓶起订) 0.961245
+4 【品牌】:代加工方式/OEMODM 0.970401
+5 【品名】:纯臻营养护发素 0.977496
+6 ODMOEM 0.955396
+7 【产品编号】:YM-X-3011 0.977864
+8 【净含量】:220ml 0.970538
+9 【适用人群】:适合所有肤质 0.995907
+10 【主要成分】:鲸蜡硬脂醇、燕麦β-葡聚 0.975813
+11 糖、椰油酰胺丙基甜菜碱、泛醌 0.964397
+12 (成品包材) 0.97298
+13 【主要功能】:可紧致头发磷层,从而达到 0.989097
+14 即时持久改善头发光泽的效果,给干燥的头 0.990088
+15 发足够的滋养 0.998037
+```
+
+![预测结果](https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/pipeline_deploy/edge_PP-OCRv5_mobile.jpg)
+
+## 代码介绍
+
+```
+.
+├── ...
+├── ocr
+│   ├── ...
+│   ├── android
+│   │   ├── ...
+│   │   └── shell
+│   │       └── ppocr_demo
+│   │           ├── src # 存放预测代码
+│   │           │   ├── cls_process.cc # 方向分类器的推理全流程,包含预处理、预测和后处理三部分
+│   │           │   ├── rec_process.cc # 识别模型 CRNN 的推理全流程,包含预处理、预测和后处理三部分
+│   │           │   ├── det_process.cc # 检测模型 DB 的推理全流程,包含预处理、预测和后处理三部分
+│   │           │   ├── det_post_process.cc # 检测模型 DB 的后处理文件
+│   │           │   ├── pipeline.cc # OCR 文字识别 demo 推理全流程代码
+│   │           │   └── Makefile # 预测代码的 Makefile 文件
+│   │           │
+│   │           ├── CMakeLists.txt # CMake 文件,约束可执行文件的编译方法
+│   │           ├── README.md
+│   │           ├── build.sh # 用于可执行文件的编译
+│   │           └── run.sh # 用于预测
+│   └── assets # 存放模型、测试图片、标签文件、config 文件
+│       ├── images # 存放测试图片
+│       ├── labels # 存放字典文件,更多详情可参考下文备注
+│       ├── models # 存放 nb 模型
+│       ├── config.txt
+│       └── download.sh # 下载脚本,用于下载 paddle_lite_opt 工具优化后的模型
+└── libs # 存放不同端的预测库和 OpenCV 库。
+    ├── ...
+ └── download.sh # 下载脚本,用于下载 Paddle Lite 预测库和 OpenCV 库 +``` + +**备注:** + + - `PaddleX-Lite-Deploy/ocr/assets/labels/` 目录下存放了 PP-OCRv3、PP-OCRv4 模型的字典文件 `ppocr_keys_v1.txt` 以及 PP-OCRv5 模型的字典文件 `ppocr_keys_ocrv5.txt`。在实际推理过程中,会根据模型名称自动选择相应的字典文件,因此无需手动干预。 + - 如果使用的 nb 模型是英文数字或其他语言的模型,需要更换为对应语言的字典。PaddleOCR 仓库提供了[部分字典文件](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3/ppocr/utils)。 + +```shell +# run.sh 脚本中可执行文件的参数含义: +adb shell "cd ${ppocr_demo_path} \ + && chmod +x ./ppocr_demo \ + && export LD_LIBRARY_PATH=${ppocr_demo_path}:${LD_LIBRARY_PATH} \ + && ./ppocr_demo \ + \"./models/${MODEL_NAME}_det.nb\" \ + \"./models/${MODEL_NAME}_rec.nb\" \ + ./models/${CLS_MODEL_FILE} \ + ./images/test.jpg \ + ./test_img_result.jpg \ + ./labels/${LABEL_FILE} \ + ./config.txt" + +第一个参数:ppocr_demo 可执行文件 +第二个参数:./models/${MODEL_NAME}_det.nb 检测模型的.nb文件 +第三个参数:./models/${MODEL_NAME}_rec.nb 识别模型的.nb文件 +第四个参数:./models/${CLS_MODEL_FILE} 文本行方向分类模型的.nb文件,默认根据模型名自动选择 +第五个参数:./images/test.jpg 测试图片 +第六个参数:./test_img_result.jpg 结果保存文件 +第七个参数:./labels/${LABEL_FILE} label 文件,默认根据模型名自动选择 +第八个参数:./config.txt 配置文件,模型的超参数配置文件,包含了检测器、分类器的超参数 +``` + +```shell +# config.txt 具体参数 List: +max_side_len 960 # 输入图像长宽大于 960 时,等比例缩放图像,使得图像最长边为 960 +det_db_thresh 0.3 # 用于过滤 DB 预测的二值化图像,设置为 0.3 对结果影响不明显 +det_db_box_thresh 0.5 # DB 后处理过滤 box 的阈值,如果检测存在漏框情况,可酌情减小 +det_db_unclip_ratio 1.6 # 表示文本框的紧致程度,越小则文本框更靠近文本 +use_direction_classify 0 # 是否使用方向分类器,0 表示不使用,1 表示使用 +``` + +## 工程详解 + +OCR 文字识别 demo 由三个模型一起完成 OCR 文字识别功能,对输入图片先通过 `${MODEL_NAME}_det.nb` 模型做检测处理,然后通过 `ch_ppocr_mobile_v2.0_cls_slim_opt.nb` 模型做文字方向分类处理,最后通过 `${MODEL_NAME}_rec.nb` 模型完成文字识别处理。 + +1. `pipeline.cc` : OCR 文字识别 demo 预测全流程代码 + 该文件完成了三个模型串行推理的全流程控制处理,包含整个处理过程的调度处理。 + + - `Pipeline::Pipeline(...)` 方法完成调用三个模型类构造函数,完成模型加载和线程数、绑核处理及 predictor 创建处理 + - `Pipeline::Process(...)` 方法用于完成这三个模型串行推理的全流程控制处理 + +2. `cls_process.cc` 方向分类器的预测文件 + 该文件完成了方向分类器的预处理、预测和后处理过程 + + - `ClsPredictor::ClsPredictor()` 方法用于完成模型加载和线程数、绑核处理及 predictor 创建处理 + - `ClsPredictor::Preprocess()` 方法用于模型的预处理 + - `ClsPredictor::Postprocess()` 方法用于模型的后处理 + +3. `rec_process.cc` 识别模型 CRNN 的预测文件 + 该文件完成了识别模型 CRNN 的预处理、预测和后处理过程 + + - `RecPredictor::RecPredictor()` 方法用于完成模型加载和线程数、绑核处理及 predictor 创建处理 + - `RecPredictor::Preprocess()` 方法用于模型的预处理 + - `RecPredictor::Postprocess()` 方法用于模型的后处理 + +4. `det_process.cc` 检测模型 DB 的预测文件 + 该文件完成了检测模型 DB 的预处理、预测和后处理过程 + + - `DetPredictor::DetPredictor()` 方法用于完成模型加载和线程数、绑核处理及 predictor 创建处理 + - `DetPredictor::Preprocess()` 方法用于模型的预处理 + - `DetPredictor::Postprocess()` 方法用于模型的后处理 + +5. 
`db_post_process` 检测模型 DB 的后处理函数,包含 clipper 库的调用
+   该文件完成了检测模型 DB 的第三方库调用和其他后处理方法实现
+
+   - `std::vector<std::vector<std::vector<int>>> BoxesFromBitmap(...)` 方法从 Bitmap 图中获取检测框
+   - `std::vector<std::vector<std::vector<int>>> FilterTagDetRes(...)` 方法根据检测结果获取目标框位置
+
+## 进阶使用
+
+如果快速开始部分无法满足你的需求,可以参考本节对 demo 进行自定义修改。
+
+本节主要包含四部分:
+
+- 更新预测库;
+- 转换 `.nb` 模型;
+- 更新模型、标签文件和预测图片;
+- 更新输入/输出预处理。
+
+### 更新预测库
+
+本指南所使用的预测库为最新版本(2.14rc),不推荐自行更新预测库。
+
+若有使用其他版本的需求,可参考如下步骤更新预测库:
+
+* Paddle Lite 项目:https://github.com/PaddlePaddle/Paddle-Lite
+  * 参考 [Paddle Lite 源码编译文档](https://www.paddlepaddle.org.cn/lite/develop/source_compile/compile_env.html),编译 Android 预测库
+  * 编译最终产物位于 `build.lite.xxx.xxx.xxx` 下的 `inference_lite_lib.xxx.xxx`
+  * 替换 C++ 库
+    * 头文件
+      将生成的 `build.lite.android.xxx.gcc/inference_lite_lib.android.xxx/cxx/include` 文件夹替换 demo 中的 `PaddleX-Lite-Deploy/libs/android/cxx/include`
+    * armeabi-v7a
+      将生成的 `build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/libs/libpaddle_lite_api_shared.so` 库替换 demo 中的 `PaddleX-Lite-Deploy/libs/android/cxx/libs/armeabi-v7a/libpaddle_lite_api_shared.so`
+    * arm64-v8a
+      将生成的 `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/libs/libpaddle_lite_api_shared.so` 库替换 demo 中的 `PaddleX-Lite-Deploy/libs/android/cxx/libs/arm64-v8a/libpaddle_lite_api_shared.so`
+
+### 转换 .nb 模型
+
+若想使用自己训练的模型,可先参考以下流程得到 `.nb` 模型。
+
+#### 终端命令方法(支持 Mac/Ubuntu)
+
+1. 进入 Paddle-Lite GitHub 仓库的 [release 界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择所需版本下载对应的转化工具 opt(推荐使用最新版本)。
+
+2. 下载 opt 工具后,执行以下命令(此处以 2.14rc 版本的 linux_x86 opt 工具转换 PP-OCRv5_mobile_det 模型为例):
+
+   ```bash
+   ./opt_linux_x86 \
+     --model_file=PP-OCRv5_mobile_det/inference.pdmodel \
+     --param_file=PP-OCRv5_mobile_det/inference.pdiparams \
+     --optimize_out=PP-OCRv5_mobile_det \
+     --valid_targets=arm
+   ```
+
+有关使用终端命令方法转换 `.nb` 模型的详细介绍,可参考 Paddle-Lite 仓库的[使用可执行文件 opt](https://www.paddlepaddle.org.cn/lite/v2.12/user_guides/opt/opt_bin.html)。
+
+#### Python 脚本方法(支持 Windows/Mac/Ubuntu)
+
+1. 安装最新版本的 paddlelite wheel 包。
+
+   ```bash
+   pip install --pre paddlelite
+   ```
+
+2. 使用 Python 脚本进行模型转换。以下为转换 PP-OCRv5_mobile_det 模型的示例代码:
+
+   ```python
+   from paddlelite.lite import Opt
+
+   # 1. 创建 Opt 实例
+   opt = Opt()
+   # 2. 指定输入模型地址
+   opt.set_model_file("./PP-OCRv5_mobile_det/inference.pdmodel")
+   opt.set_param_file("./PP-OCRv5_mobile_det/inference.pdiparams")
+   # 3. 指定转化类型
+   opt.set_valid_places("arm")
+   # 4. 指定输出模型地址
+   opt.set_optimize_out("./PP-OCRv5_mobile_det")
+   # 5. 执行模型优化
+   opt.run()
+   ```
+
+有关使用 Python 脚本方法转换 `.nb` 模型的详细介绍,可参考 Paddle-Lite 仓库的[使用 Python 脚本 opt](https://www.paddlepaddle.org.cn/lite/v2.12/api_reference/python_api/opt.html)。
+
+**注意**
+
+- 有关模型优化工具 opt 的详细介绍,可参考 Paddle-Lite 仓库的[模型优化工具 opt](https://www.paddlepaddle.org.cn/lite/v2.12/user_guides/model_optimize_tool.html)。
+- 目前仅支持将 `.pdmodel` 格式的静态图模型转换为 `.nb` 格式。
+
+### 更新模型、标签文件和预测图片
+
+#### 更新模型
+
+本指南只对 `PP-OCRv3_mobile`、`PP-OCRv4_mobile`、`PP-OCRv5_mobile` 模型进行了验证,其他模型不保证适用性。
+
+如果你对 `PP-OCRv5_mobile` 模型进行了微调,并生成了一个名为 `PP-OCRv5_mobile_ft` 的新模型,可以按照以下步骤将原有模型替换为你的微调模型:
+
+1. 将 `PP-OCRv5_mobile_ft` 的 nb 模型存放到目录 `PaddleX-Lite-Deploy/ocr/assets/models/` 下,最终得到的文件结构如下:
+
+   ```text
+   .
+   ├── ocr
+   │   ├── ...
+   │   └── assets
+   │       ├── models
+   │       │   ├── ...
+   │       │   ├── PP-OCRv5_mobile_ft_det.nb
+   │       │   └── PP-OCRv5_mobile_ft_rec.nb
+   │       └── ...
+   └── ...
+   ```
+
+2. 将模型名加入到 `run.sh` 脚本中的 `MODEL_LIST`。
+
+   ```shell
+   MODEL_LIST="PP-OCRv3_mobile PP-OCRv4_mobile PP-OCRv5_mobile PP-OCRv5_mobile_ft" # 模型之间以单个空格为间隔
+   ```
+
+3. 运行 `run.sh` 脚本时使用模型目录名。
+
+   ```shell
+   sh run.sh PP-OCRv5_mobile_ft
+   ```
+
+**注意:**
+
+- 如果更新后模型的输入 Tensor、Shape 和 Dtype 发生变化:
+
+  - 更新文字方向分类器模型,则需要更新 `ppocr_demo/src/cls_process.cc` 中的 `ClsPredictor::Preprocess` 函数
+  - 更新检测模型,则需要更新 `ppocr_demo/src/det_process.cc` 中的 `DetPredictor::Preprocess` 函数
+  - 更新识别模型,则需要更新 `ppocr_demo/src/rec_process.cc` 中的 `RecPredictor::Preprocess` 函数
+
+- 如果更新后模型的输出 Tensor 和 Dtype 发生变化:
+
+  - 更新文字方向分类器模型,则需要更新 `ppocr_demo/src/cls_process.cc` 中的 `ClsPredictor::Postprocess` 函数
+  - 更新检测模型,则需要更新 `ppocr_demo/src/det_process.cc` 中的 `DetPredictor::Postprocess` 函数
+  - 更新识别模型,则需要更新 `ppocr_demo/src/rec_process.cc` 中的 `RecPredictor::Postprocess` 函数
+
+#### 更新标签文件
+
+如果需要更新标签文件,则需要将新的标签文件存放在目录 `PaddleX-Lite-Deploy/ocr/assets/labels/` 下,并参考模型更新方法,更新 `PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo/run.sh` 中的执行命令。
+
+以更新 `new_labels.txt` 为例:
+
+   ```shell
+   # 代码文件 `PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo/run.sh`
+   # old
+   adb shell "cd ${ppocr_demo_path} \
+      && chmod +x ./ppocr_demo \
+      && export LD_LIBRARY_PATH=${ppocr_demo_path}:${LD_LIBRARY_PATH} \
+      && ./ppocr_demo \
+         \"./models/${MODEL_NAME}_det.nb\" \
+         \"./models/${MODEL_NAME}_rec.nb\" \
+         ./models/${CLS_MODEL_FILE} \
+         ./images/test.jpg \
+         ./test_img_result.jpg \
+         ./labels/${LABEL_FILE} \
+         ./config.txt"
+   # update
+   adb shell "cd ${ppocr_demo_path} \
+      && chmod +x ./ppocr_demo \
+      && export LD_LIBRARY_PATH=${ppocr_demo_path}:${LD_LIBRARY_PATH} \
+      && ./ppocr_demo \
+         \"./models/${MODEL_NAME}_det.nb\" \
+         \"./models/${MODEL_NAME}_rec.nb\" \
+         ./models/${CLS_MODEL_FILE} \
+         ./images/test.jpg \
+         ./test_img_result.jpg \
+         ./labels/new_labels.txt \
+         ./config.txt"
+   ```
+
+#### 更新预测图片
+
+如果需要更新预测图片,将更新的图片存放在 `PaddleX-Lite-Deploy/ocr/assets/images/` 下,并更新 `PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo/run.sh` 中的执行命令。
+
+以更新 `new_pics.jpg` 为例:
+
+   ```shell
+   # 代码文件 `PaddleX-Lite-Deploy/ocr/android/shell/ppocr_demo/run.sh`
+   # old
+   adb shell "cd ${ppocr_demo_path} \
+      && chmod +x ./ppocr_demo \
+      && export LD_LIBRARY_PATH=${ppocr_demo_path}:${LD_LIBRARY_PATH} \
+      && ./ppocr_demo \
+         \"./models/${MODEL_NAME}_det.nb\" \
+         \"./models/${MODEL_NAME}_rec.nb\" \
+         ./models/${CLS_MODEL_FILE} \
+         ./images/test.jpg \
+         ./test_img_result.jpg \
+         ./labels/${LABEL_FILE} \
+         ./config.txt"
+   # update
+   adb shell "cd ${ppocr_demo_path} \
+      && chmod +x ./ppocr_demo \
+      && export LD_LIBRARY_PATH=${ppocr_demo_path}:${LD_LIBRARY_PATH} \
+      && ./ppocr_demo \
+         \"./models/${MODEL_NAME}_det.nb\" \
+         \"./models/${MODEL_NAME}_rec.nb\" \
+         ./models/${CLS_MODEL_FILE} \
+         ./images/new_pics.jpg \
+         ./test_img_result.jpg \
+         ./labels/${LABEL_FILE} \
+         ./config.txt"
+   ```
+
+### 更新输入/输出预处理
+
+- 更新输入预处理
+
+  - 更新文字方向分类器模型,则需要更新 `ppocr_demo/src/cls_process.cc` 中的 `ClsPredictor::Preprocess` 函数
+  - 更新检测模型,则需要更新 `ppocr_demo/src/det_process.cc` 中的 `DetPredictor::Preprocess` 函数
+  - 更新识别模型,则需要更新 `ppocr_demo/src/rec_process.cc` 中的 `RecPredictor::Preprocess` 函数
+
+- 更新输出后处理
+
+  - 更新文字方向分类器模型,则需要更新 `ppocr_demo/src/cls_process.cc` 中的 `ClsPredictor::Postprocess` 函数
+  - 更新检测模型,则需要更新 `ppocr_demo/src/det_process.cc` 中的 `DetPredictor::Postprocess` 函数
+  - 更新识别模型,则需要更新 `ppocr_demo/src/rec_process.cc` 中的 `RecPredictor::Postprocess` 函数
diff --git a/docs/version3.x/deployment/python_and_cpp_infer.en.md b/docs/version3.x/deployment/python_and_cpp_infer.en.md
new file mode 100644
index 0000000000000000000000000000000000000000..acdfe62a9c8284996578857faa78ff1a3a8af23a
--- /dev/null
+++ b/docs/version3.x/deployment/python_and_cpp_infer.en.md
@@ -0,0 +1,11 @@
+# Inference with Python or C++ Prediction Engine
+
+Since the 2.x branch, inference with Python or C++ prediction engines has been a significant feature. This functionality allows users to load OCR-related models and perform inference without installing the wheel package.
+
+Due to differences in pre-processing, post-processing, and concatenation details compared to the wheel package, the inference results may vary slightly, and the two cannot be used interchangeably.
+
+For specific usage instructions, please refer to the following documents:
+
+* [Inference with Python Prediction Engine](../../version2.x/legacy/python_infer.md)
+* [Inference with C++ Prediction Engine](../../version2.x/legacy/cpp_infer.md)
+* [List of Supported Models](../../version2.x/legacy/model_list_2.x.md)
diff --git a/docs/version3.x/deployment/python_and_cpp_infer.md b/docs/version3.x/deployment/python_and_cpp_infer.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e9c7be8d130dae9f0ce98af4c8dbe7193da7e48
--- /dev/null
+++ b/docs/version3.x/deployment/python_and_cpp_infer.md
@@ -0,0 +1,11 @@
+# 基于Python或C++预测引擎推理
+
+自 2.x 分支以来,基于 Python 或 C++ 的预测引擎推理一直是一项重要功能。该功能允许用户在无需安装 wheel 包的情况下加载 OCR 相关模型并进行推理。
+
+由于其在前后处理和串联细节上与 wheel 包存在差异,推理效果会略有不同,两者无法直接互换使用。
+
+有关具体使用方法,请参考以下文档:
+
+* [基于Python预测引擎推理](../../version2.x/legacy/python_infer.md)
+* [基于C++预测引擎推理](../../version2.x/legacy/cpp_infer.md)
+* [支持的模型列表](../../version2.x/legacy/model_list_2.x.md)
diff --git a/docs/version3.x/deployment/serving.en.md b/docs/version3.x/deployment/serving.en.md
index cc1960e02d177ffe34eddb10ddc99e78616bcd41..8307582abd07f7f759fbcee0e5f3946264241a23 100644
--- a/docs/version3.x/deployment/serving.en.md
+++ b/docs/version3.x/deployment/serving.en.md
@@ -60,7 +60,7 @@ The command-line options related to serving are as follows:
 
 --device
 
-Deployment device for the pipeline. Defaults to cpu (if GPU is unavailable) or gpu (if GPU is available).
+Deployment device for the pipeline. By default, a GPU will be used if available; otherwise, a CPU will be used.
--host diff --git a/docs/version3.x/deployment/serving.md b/docs/version3.x/deployment/serving.md index 06ee5f7cd34ce0e2194322aaee9a155d2080a36b..5aa0ca346511d41c247d2bf1fd54fa52e5c3b8c9 100644 --- a/docs/version3.x/deployment/serving.md +++ b/docs/version3.x/deployment/serving.md @@ -60,7 +60,7 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) --device -产线部署设备。默认为 cpu(如 GPU 不可用)或 gpu(如 GPU 可用)。 +产线部署设备。默认情况下,当 GPU 可用时,将使用 GPU;否则使用 CPU。 --host diff --git a/docs/version3.x/model_list.md b/docs/version3.x/model_list.md index 6787802101a83227ebafafb1a2d0f7a4d0d07694..1a1f466d556d430c6901d6eadc583bac583c8296 100644 --- a/docs/version3.x/model_list.md +++ b/docs/version3.x/model_list.md @@ -127,21 +127,21 @@ PaddleOCR 内置了多条产线,每条产线都包含了若干模块,每个 PP-OCRv5_server_rec -- -- / - -- / - -206 M +86.38 +8.45/2.36 +122.69/122.69 +81 M PP-OCRv5_server_rec.yaml -推理模型/训练模型 +推理模型/训练模型 PP-OCRv5_mobile_rec -- -- / - -- / - -137 M +81.29 +1.46/5.43 +5.32/91.79 +16 M PP-OCRv5_mobile_rec.yaml -推理模型/训练模型 +推理模型/训练模型 PP-OCRv4_server_rec_doc diff --git a/docs/version3.x/module_usage/doc_img_orientation_classification.en.md b/docs/version3.x/module_usage/doc_img_orientation_classification.en.md index 302d7fb159849cb717d898a507cb0a0bd0d22759..dd6011af4239877e1c1f6a55b600d4aba93e0fee 100644 --- a/docs/version3.x/module_usage/doc_img_orientation_classification.en.md +++ b/docs/version3.x/module_usage/doc_img_orientation_classification.en.md @@ -118,90 +118,126 @@ Here is the visualization of the image: The explanations of relevant methods and parameters are as follows: * Instantiate the document image orientation classification model with `DocImgOrientationClassification` (taking `PP-LCNet_x1_0_doc_ori` as an example here). The specific explanations are as follows: - - - - + + + - - + - - + - + - - + - - + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Parameter DescriptionParameter TypeOptionsDefault ValueTypeDefault
model_name Model name strNoneNonePP-LCNet_x1_0_doc_ori
model_dir Model storage path strNoneNoneNone
deviceModel inference deviceDevice(s) to use for inference.
+Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
+If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
+By default, GPU 0 will be used if available; otherwise, the CPU will be used. +
strSupports specifying the specific card number of GPU, such as "gpu:0", specific card numbers of other hardware, such as "npu:0", and CPU, such as "cpu".gpu:0None
use_hpipWhether to enable the high-performance inference pluginenable_hpiWhether to use the high performance inference. boolNone False
hpi_configHigh-performance inference configurationdict | NoneNoneuse_tensorrtWhether to use the Paddle Inference TensorRT subgraph engine.boolFalse
min_subgraph_sizeMinimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine.int3
precisionPrecision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
Options: fp32, fp16, etc.
strfp32
enable_mkldnn +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +boolTrue
cpu_threadsNumber of threads to use for inference on CPUs.int10
top_kThe top-k value for prediction results. If not specified, the default value in the official PaddleOCR model configuration is used. If the value is 5, the top 5 categories and their corresponding classification probabilities will be returned.int None
+ + * Among them, `model_name` must be specified. After specifying `model_name`, the model parameters built into PaddleX are used by default. On this basis, when `model_dir` is specified, the user-defined model is used. * Call the `predict()` method of the document image orientation classification model for inference prediction. This method will return a list of results. In addition, this module also provides the `predict_iter()` method. The two methods are completely consistent in terms of parameter acceptance and result return. The difference is that `predict_iter()` returns a `generator`, which can process and obtain prediction results step by step, suitable for scenarios where large datasets need to be processed or memory needs to be saved. You can choose either of these two methods according to your actual needs. The parameters of the `predict()` method are `input` and `batch_size`, and the specific explanations are as follows: - - - - + + - - - - + + - + - + + + + + +
Parameter DescriptionParameter TypeOptionsDefault ValueTypeDefault
inputData to be predicted, supporting multiple input typesPython Var/str/list +Input data to be predicted. Required. Supports multiple input types:
    -
  • Python variable, such as image data represented by numpy.ndarray
  • -
  • File path, such as the local path of an image file: /root/data/img.jpg
  • -
  • URL link, such as the network URL of an image file: Example
  • -
  • Local directory, which should contain the data files to be predicted, such as the local path: /root/data/
  • -
  • List, the elements of the list should be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
  • +
  • Python Var: e.g., numpy.ndarray representing image data
  • +
  • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
  • +
  • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
NonePython Var|str|list
batch_sizeBatch sizeBatch size, positive integer. intAny integer 1
top_kThe top-k value for prediction results. If not specified, the value provided when the model was instantiated will be used; if it was not specified at instantiation either, the default value in the official PaddleOCR model configuration is used.intNone
* Process the prediction results. The prediction result for each sample is the corresponding Result object, and it supports operations such as printing, saving as an image, and saving as a `json` file: diff --git a/docs/version3.x/module_usage/doc_img_orientation_classification.md b/docs/version3.x/module_usage/doc_img_orientation_classification.md index 7336b4b465ef9a9fcd220c52a3fb38433a2c2c2c..dae49dc802b6c63bf92244f077ee62ef3fede3cb 100644 --- a/docs/version3.x/module_usage/doc_img_orientation_classification.md +++ b/docs/version3.x/module_usage/doc_img_orientation_classification.md @@ -125,50 +125,84 @@ for res in output: 参数 -参数说明 -参数类型 -可选项 +说明 +类型 默认值 + model_name 模型名称 str -无 - +PP-LCNet_x1_0_doc_ori model_dir 模型存储路径 str -无 -无 +None device -模型推理设备 +用于推理的设备。
+例如:cpugpunpugpu:0gpu:0,1
+如指定多个设备,将进行并行推理。
+默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 + str -支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 -gpu:0 +None -use_hpip -是否启用高性能推理插件 +enable_hpi +是否启用高性能推理。 bool -无 False -hpi_config -高性能推理配置 -dict | None -无 +use_tensorrt +是否启用 Paddle Inference 的 TensorRT 子图引擎。 +bool +False + + +min_subgraph_size +当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。 +int +3 + + +precision +当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
可选项:fp32fp16 等。 +str +fp32 + + +enable_mkldnn + +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
+ +bool +True + + +cpu_threads +在 CPU 上推理时使用的线程数量。 +int +10 + + +top_k +预测结果的前topk值,如果不指定,将默认使用PaddleOCR官方模型配置。若值为5,表示打印(返回)预测结果的前5个类别和对应的分类概率 +int None + -* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + + +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 * 调用文档图像方向分类模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input` 和 `batch_size`,具体说明如下: @@ -178,32 +212,33 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据,支持多种输入类型 -Python Var/str/list - +待预测数据,支持多种输入类型,必填。 -无 +Python Var|str|list + batch_size -批大小 +批大小,可设置为任意正整数。 int -任意整数 1 + +top_k +预测结果的前topk值,如果不指定,将默认使用实例化模型的值。若实例化也没有指定,则默认使用PaddleOCR官方模型配置。 +int +None + * 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为图片、保存为`json`文件的操作: diff --git a/docs/version3.x/module_usage/doc_vlm.en.md b/docs/version3.x/module_usage/doc_vlm.en.md index e09e312aafd3ade844b3f3f898eeaf10f5c7112e..dba4b8f9755ac3a7cd3aab0ca17220de0050b57a 100644 --- a/docs/version3.x/module_usage/doc_vlm.en.md +++ b/docs/version3.x/module_usage/doc_vlm.en.md @@ -104,47 +104,74 @@ Explanations of related methods, parameters, etc., are as follows: Parameter Description Type -Options Default + model_name -Model Name +Model name str -None -None +PP-DocBee-2B model_dir -Model Storage Path +Model storage path str -None -None +None device -Model Inference Device +Device(s) to use for inference.
+Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
+If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
+By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -Supports specifying specific GPU card number, such as "gpu:0", other hardware specific card numbers, such as "npu:0", CPU such as "cpu". -gpu:0 +None -use_hpip -Whether to enable high-performance inference plugin. Currently not supported. +enable_hpi +Whether to use the high performance inference. bool -None False -hpi_config -High-performance inference configuration. Currently not supported. -dict | None -None -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False + +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn + +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + + * Among them, `model_name` must be specified. After specifying `model_name`, the default PaddleX built-in model parameters will be used. On this basis, when specifying `model_dir`, user-defined models will be used. * Call the `predict()` method of the document visual language model for inference prediction. This method will return a result list. Additionally, this module also provides the `predict_iter()` method. Both are completely consistent in terms of parameter acceptance and result return, the difference being that `predict_iter()` returns a `generator`, capable of gradually processing and obtaining prediction results, suitable for handling large datasets or scenarios where memory saving is desired. You can choose to use either of these methods based on actual needs. The `predict()` method parameters include `input`, `batch_size`, with specific explanations as follows: @@ -155,25 +182,21 @@ Explanations of related methods, parameters, etc., are as follows: Parameter Description Type -Options Default input -Data to be predicted -dict - -Dict, as multimodal models have different input requirements, it needs to be determined based on the specific model. Specifically: -
  • PP-DocBee series input format is {'image': image_path, 'query': query_text}
  • +Input data. Required. Since multimodal models have different input requirements, please refer to the specific model for the correct format.
    +For example, for the PP-DocBee series models, the input format should be: {'image': image_path, 'query': query_text} +dict None batch_size -Batch Size +Batch size, positive integer. int -Integer 1 diff --git a/docs/version3.x/module_usage/doc_vlm.md b/docs/version3.x/module_usage/doc_vlm.md index 1b52ed804d58029531896cf3a88cbc22e2ec38c3..c7c1f5eacc7dae2ac2b892f7122d3ffa5ce01b36 100644 --- a/docs/version3.x/module_usage/doc_vlm.md +++ b/docs/version3.x/module_usage/doc_vlm.md @@ -107,48 +107,76 @@ for res in results: 参数 参数说明 参数类型 -可选项 默认值 + model_name 模型名称 str -无 - +PP-DocBee-2B model_dir 模型存储路径 str -无 -无 +None device -模型推理设备 +用于推理的设备。
    +例如:cpugpunpugpu:0gpu:0,1
    +如指定多个设备,将进行并行推理。
    +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 + str -支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 -gpu:0 +None -use_hpip -是否启用高性能推理插件。目前暂不支持。 +enable_hpi +是否启用高性能推理。 bool -无 False -hpi_config -高性能推理配置。目前暂不支持。 -dict | None -无 -None +use_tensorrt +是否启用 Paddle Inference 的 TensorRT 子图引擎。 +bool +False + + +min_subgraph_size +当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。 +int +3 + +precision +当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
    可选项:fp32fp16 等。 +str +fp32 + + +enable_mkldnn + +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
    + +bool +True + + +cpu_threads +在 CPU 上推理时使用的线程数量。 +int +10 + + -* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + + +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 * 调用文档类视觉语言模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input` 、 `batch_size`,具体说明如下: @@ -158,25 +186,21 @@ for res in results: 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据 -dict - -Dict, 由于多模态模型对输入有不同的要求,需要根据具体的模型确定,具体而言: -
  • PP-DocBee系列的输入形式为{'image': image_path, 'query': query_text}
  • +待预测数据,必填。由于多模态模型对输入要求不同,请根据具体模型设定输入格式。
    +例如:对于 PP-DocBee 系列模型,输入形式应为:{'image': image_path, 'query': query_text} +dictbatch_size -批大小 +批大小,可设置为任意正整数。 int -整数 1 diff --git a/docs/version3.x/module_usage/formula_recognition.en.md b/docs/version3.x/module_usage/formula_recognition.en.md index c2b3af224d229ba662619a27b2e3bdf857334226..4f2951f3edf86e5545614f3c03d34f468fb8f785 100644 --- a/docs/version3.x/module_usage/formula_recognition.en.md +++ b/docs/version3.x/module_usage/formula_recognition.en.md @@ -166,47 +166,74 @@ Related methods and parameter descriptions are as follows: Parameter Description Type -Options Default + model_name -Model name +Name of the model str -All model names supported by PaddleX -None +PP-FormulaNet_plus-M model_dir Model storage path str -None -None +None device -Device used for model inference +Device(s) to use for inference.
    +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
    +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
    +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -Supports specifying a specific GPU card such as \"gpu:0\", other hardware card such as \"npu:0\", and CPU such as \"cpu\". -gpu:0 +None -use_hpip -Whether to enable high-performance inference plugin +enable_hpi +Whether to use the high performance inference. bool -None False -hpi_config -High-performance inference configuration -dict | None -None -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False + +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
    Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn + +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + + * Among these, `model_name` must be specified. When `model_name` is provided, the built-in model parameters from PaddleX are used by default. If `model_dir` is also specified, it will use the user-defined model instead. * Call the `predict()` method of the formula recognition model to perform inference, which returns a result list. @@ -220,30 +247,28 @@ You can choose either method based on your actual needs. The `predict()` method Parameter Description Type -Options Default input -Input data to be predicted; supports multiple input types -Python Var/str/list - +Input data to be predicted. Required. Supports multiple input types: -None +Python Var|str|list + batch_size -Batch size +Batch size, positive integer. int -Any integer 1 diff --git a/docs/version3.x/module_usage/formula_recognition.md b/docs/version3.x/module_usage/formula_recognition.md index 60eabc3aeb3ed34d59f1a090f79e63e300f24a13..db7bab528a819cce1df6d90ce98766e791293eb7 100644 --- a/docs/version3.x/module_usage/formula_recognition.md +++ b/docs/version3.x/module_usage/formula_recognition.md @@ -172,53 +172,75 @@ sudo apt-get install texlive texlive-latex-base texlive-xetex latex-cjk-all texl 相关方法、参数等说明如下: * `FormulaRecognition`实例化公式识别模型(此处以`PP-FormulaNet_plus-M`为例),具体说明如下: - - +
    - - - + - - + - - + - + - - + - - + + - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数 参数说明 参数类型可选项 默认值
    model_name 模型名称 str所有支持的模型名称PP-FormulaNet_plus-M
    model_dir 模型存储路径 strNone
    device模型推理设备用于推理的设备。
    +例如:cpugpunpugpu:0gpu:0,1
    +如指定多个设备,将进行并行推理。
    +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 +
    str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0None
    use_hpip是否启用高性能推理插件enable_hpi是否启用高性能推理。 bool False
    hpi_config高性能推理配置dict | NoneNoneuse_tensorrt是否启用 Paddle Inference 的 TensorRT 子图引擎。boolFalse
    min_subgraph_size当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。int3
    precision当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
    可选项:fp32fp16 等。
    strfp32
    enable_mkldnn +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
    +
    boolTrue
    cpu_threads在 CPU 上推理时使用的线程数量。int10
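+
+下面给出一个最小的使用示意(图片文件名为假设值,仅作演示;可用参数以上表为准):
+
+```python
+from paddleocr import FormulaRecognition
+
+# 实例化公式识别模型:model_name 必须指定,其余参数可选
+model = FormulaRecognition(model_name="PP-FormulaNet_plus-M", device="cpu")
+
+# 对示例图片执行预测;"formula_demo.png" 为假设的本地文件
+output = model.predict(input="formula_demo.png", batch_size=1)
+for res in output:
+    res.print()                           # 打印识别结果
+    res.save_to_json(save_path="output")  # 保存 json 结果
+```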
    - * 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 * 调用公式识别模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input` 和 `batch_size`,具体说明如下: @@ -229,30 +251,25 @@ sudo apt-get install texlive texlive-latex-base texlive-xetex latex-cjk-all texl 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据,支持多种输入类型 -Python Var/str/list - +待预测数据,支持多种输入类型,必填。 -无 +Python Var|str|list + batch_size -批大小 +批大小,可设置为任意正整数。 int -任意整数 1 diff --git a/docs/version3.x/module_usage/layout_detection.en.md b/docs/version3.x/module_usage/layout_detection.en.md index 4e9a139ea72a2c50d0610d3b94b9de6a6da7395b..d7716850985eb75457116b9e089e6fc1a3e66e5a 100644 --- a/docs/version3.x/module_usage/layout_detection.en.md +++ b/docs/version3.x/module_usage/layout_detection.en.md @@ -338,113 +338,138 @@ Relevant methods, parameters, and explanations are as follows: Parameter Description Type -Options -Default Value +Default + model_name -Name of the model +Model name str -None -None +PP-DocLayout-L model_dir -Path to store the model +Model storage path str -None -None +None device -The device used for model inference +Device(s) to use for inference.
    +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
    +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
    +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -It supports specifying specific GPU card numbers, such as "gpu:0", other hardware card numbers, such as "npu:0", or CPU, such as "cpu". -gpu:0 +None -img_size -Size of the input image; if not specified, the default 800x800 be used -int/list/None +enable_hpi +Whether to use the high performance inference. +bool +False + + +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False + + +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
    Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + +img_size +Input image size; if not specified, the default 800x800 will be used by PP-DocLayout_plus-L
    Examples: +int/list/None None threshold -Threshold for filtering low-confidence prediction results; if not specified, the default 0.5 will be used -float/dict/None - +Threshold for filtering low-confidence predictions; defaults to 0.5 if not specified
    Examples: +float/dict/None None layout_nms -Whether to use NMS post-processing to filter overlapping boxes; if not specified, the default False will be used -bool/None - +Whether to use NMS post-processing to filter overlapping boxes; if not specified, the default PaddleOCR official model configuration will be used
Examples: +bool/None None layout_unclip_ratio -Scaling factor for the side length of the detection box; if not specified, the default 1.0 will be used -float/list/dict/None - +Scaling factor for the side length of the detection box; if not specified, the default PaddleOCR official model configuration will be used
Examples: +float/list/dict/None +None + layout_merge_bboxes_mode -Merging mode for the detection boxes output by the model; if not specified, the default union will be used -string/dict/None - +Bounding box merge mode for model output; if not specified, the default PaddleOCR official model configuration will be used.
    Examples: +string/dict/None None - -use_hpip -Whether to enable the high-performance inference plugin -bool -None -False - - -hpi_config -High-performance inference configuration -dict | None -None -None - + +* Note that `model_name` must be specified. After specifying `model_name`, the default PaddleX built-in model parameters will be used. If `model_dir` is specified, the user-defined model will be used. + * The `predict()` method of the target detection model is called for inference prediction. The parameters of the `predict()` method are `input`, `batch_size`, and `threshold`, which are explained as follows: @@ -453,84 +478,87 @@ Relevant methods, parameters, and explanations are as follows: - - + - - - - + + - + - - - - + + - - - + - - - + + + - - - + -
    Parameter Description TypeOptionsDefault ValueDefault
    inputData for prediction, supporting multiple input typesPython Var/str/list +Input data to be predicted. Required. Supports multiple input types:
      -
    • Python Variable, such as image data represented by numpy.ndarray
    • -
    • File Path, such as the local path of an image file: /root/data/img.jpg
    • -
    • URL link, such as the network URL of an image file: 示例
    • -
    • Local Directory, the directory should contain the data files to be predicted, such as the local path: /root/data/
    • -
    • List, the elements of the list should be of the above-mentioned data types, such as [numpy.ndarray, numpy.ndarray], [\"/root/data/img1.jpg\", \"/root/data/img2.jpg\"], [\"/root/data1\", \"/root/data2\"]
    • +
    • Python Var: e.g., numpy.ndarray representing image data
    • +
    • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
    • +
    • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
    NonePython Var|str|list
    batch_sizeBatch sizeBatch size, positive integer. intAny integer greater than 0 1
    thresholdThreshold for filtering low-confidence prediction resultsfloat/dict/None +Threshold for filtering low-confidence predictions. If not specified, the model's default will be used.
    Examples:
      -
    • float, e.g., 0.2, means filtering out all bounding boxes with a confidence score less than 0.2
    • -
    • Dictionary, with keys as int representing cls_id and values as float thresholds. For example, {0: 0.45, 2: 0.48, 7: 0.4} means applying a threshold of 0.45 for cls_id 0, 0.48 for cls_id 2, and 0.4 for cls_id 7
    • -
    • None, not specified, will use the threshold parameter specified in create_model. If not specified in create_model, the default 0.5 will be used
    • +
    • float: e.g., 0.2, filters out all boxes with scores below 0.2
    • +
    • dict: keys are int representing cls_id, and values are float thresholds. For example, {0: 0.45, 2: 0.48, 7: 0.4} applies thresholds of 0.45 to class 0, 0.48 to class 2, and 0.4 to class 7
    • +
    • None: if not specified, defaults to 0.5
    float/dict/NoneNone
    layout_nmsWhether to use NMS post-processing to filter overlapping boxes; if not specified, the default False will be usedbool/None +Whether to use NMS post-processing to filter overlapping boxes
    Examples:
      -
    • bool, True/False, indicates whether to use NMS for post-processing to filter overlapping boxes
    • -
    • None, not specified, will use the layout_nms parameter specified in create_model. If not specified in create_model, the default False will be used
    • +
    • bool: True/False, whether to apply NMS to filter overlapping detection boxes
    • +
• None: if not specified, uses the layout_nms value from create_model; if that is also not set, NMS will not be used by default
    bool/None None
    layout_unclip_ratioScaling factor for the side length of the detection box; if not specified, the default 1.0 will be usedfloat/list/dict/None +Scaling ratio for the detected box size. If not specified, defaults to 1.0
    Examples:
      -
    • float, a positive float number, e.g., 1.1, means expanding the width and height of the detection box by 1.1 times while keeping the center unchanged
    • -
    • List, e.g., [1.2, 1.5], means expanding the width by 1.2 times and the height by 1.5 times while keeping the center unchanged
    • -
    • dict, keys as int representing cls_id, values as float scaling factors, e.g., {0: (1.1, 2.0)} means cls_id 0 expanding the width by 1.1 times and the height by 2.0 times while keeping the center unchanged
    • -
    • None, not specified, will use the layout_unclip_ratio parameter specified in create_model. If not specified in create_model, the default 1.0 will be used
    • +
• float: a positive float number, e.g., 1.1, means expanding the width and height of the detection box by 1.1 times while keeping the center unchanged
    • +
    • list: e.g., [1.2, 1.5], means expanding the width by 1.2 times and the height by 1.5 times while keeping the center unchanged
    • +
    • dict: keys are int representing cls_id, values are tuple, e.g., {0: (1.1, 2.0)} means cls_id 0 expanding the width by 1.1 times and the height by 2.0 times while keeping the center unchanged
    • +
    • None: if not specified, defaults to 1.0
    float/list/dict/NoneNone
    layout_merge_bboxes_modeMerging mode for the detection boxes output by the model; if not specified, the default union will be usedstring/dict/None +Merge mode for detected bounding boxes. Defaults to union if not specified
    Examples:
      -
    • large, when set to large, only the largest external box will be retained for overlapping detection boxes, and the internal overlapping boxes will be deleted
    • -
    • small, when set to small, only the smallest internal box will be retained for overlapping detection boxes, and the external overlapping boxes will be deleted
    • -
    • union, no filtering of boxes will be performed, and both internal and external boxes will be retained
    • -
    • dict, keys as int representing cls_id and values as merging modes, e.g., {0: "large", 2: "small"}
    • -
    • None, not specified, will use the layout_merge_bboxes_mode parameter specified in create_model. If not specified in create_model, the default union will be used
    • +
    • large: keeps only the largest outer box when overlapping/contained boxes exist
    • +
    • small: keeps only the smallest inner box when overlapping/contained boxes exist
    • +
    • union: no filtering, keeps all overlapping boxes
    • +
    • dict: keys are int cls_id, values are str, e.g., {0: "large", 2: "small"} applies different merge modes to different classes
    • +
    • None: if not specified, defaults to union
    string/dict/None None
    + + + + +

    If None is passed to predict(), the value set during model instantiation (__init__) will be used; if it was also None there, the framework defaults are applied:
    +    threshold=0.5, layout_nms=False, layout_unclip_ratio=1.0, layout_merge_bboxes_mode="union".
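+
+As a minimal sketch of this precedence (the image file name is hypothetical; parameter semantics follow the tables above):
+
+```python
+from paddleocr import LayoutDetection
+
+# threshold set at instantiation becomes the fallback for later predict() calls
+model = LayoutDetection(model_name="PP-DocLayout_plus-L", threshold=0.4)
+
+# No threshold passed: the 0.4 from __init__ applies
+results = model.predict(input="layout_demo.png", batch_size=1)
+
+# Explicit per-call value: overrides the instance setting for this call only
+results = model.predict(input="layout_demo.png", threshold={0: 0.45, 2: 0.48})
+
+for res in results:
+    res.print()
+    res.save_to_img(save_path="output")
+```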

    + * Process the prediction results, with each sample's prediction result being the corresponding Result object, and supporting operations such as printing, saving as an image, and saving as a 'json' file: diff --git a/docs/version3.x/module_usage/layout_detection.md b/docs/version3.x/module_usage/layout_detection.md index 57b23f8e6f6099cc6ff472d8d6fec8f94f72ea96..a48a736af5227df2806b2952a7611b2345399035 100644 --- a/docs/version3.x/module_usage/layout_detection.md +++ b/docs/version3.x/module_usage/layout_detection.md @@ -334,205 +334,227 @@ for res in output: 相关方法、参数等说明如下: * `LayoutDetection`实例化目标检测模型(此处以`PP-DocLayout_plus-L`为例),具体说明如下: + - + - - + - - + - + - - + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + - - - + - - - + + + - - - + - - - - - - - - - - - - - - - +
    参数 参数说明 参数类型可选项 默认值
    model_name 模型名称 strPP-DocLayout-L
    model_dir 模型存储路径 strNone
    device模型推理设备用于推理的设备。
    +例如:cpugpunpugpu:0gpu:0,1
    +如指定多个设备,将进行并行推理。
    +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 +
    str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0None
    img_size输入图像大小;如果不指定,PP-DocLayout_plus-L模型将默认使用800x800int/list/Noneenable_hpi是否启用高性能推理。boolFalse
    use_tensorrt是否启用 Paddle Inference 的 TensorRT 子图引擎。boolFalse
    min_subgraph_size当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。int3
    precision当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
    可选项:fp32fp16 等。
    strfp32
    enable_mkldnn +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
    +
    boolTrue
    cpu_threads在 CPU 上推理时使用的线程数量。int10
    img_size输入图像大小;如果不指定,PP-DocLayout_plus-L模型将默认使用800x800
    可选示例:
      -
    • int, 如 640 , 表示将输入图像resize到640x640大小
    • -
    • 列表, 如 [640, 512] , 表示将输入图像resize到宽为640,高为512大小
    • -
    • None, 不指定,PP-DocLayout_plus-L模型将默认使用800x800
    • +
    • int:如 640 , 表示将输入图像resize到640x640大小
    • +
    • list: 如 [640, 512] , 表示将输入图像resize到宽为640,高为512大小
    • +
    • None:不指定,PP-DocLayout_plus-L模型将默认使用800x800
    int/list/None None
    threshold用于过滤掉低置信度预测结果的阈值;如果不指定,将默认使用0.5float/dict/None +用于过滤掉低置信度预测结果的阈值;如果不指定,将默认使用PaddleOCR官方模型配置
    可选示例:
      -
    • float,如 0.2, 表示过滤掉所有阈值小于0.2的目标框
    • -
    • 字典,字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
    • -
    • None, 不指定,将默认使用0.5
    • +
    • float:如 0.2, 表示过滤掉所有阈值小于0.2的目标框
    • +
    • dict:字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
    • +
    • None:不指定,将默认使用PaddleOCR官方模型配置
    float/dict/None None
    layout_nms是否使用NMS后处理,过滤重叠框;如果不指定,将默认不使用NMSbool/None +是否使用NMS后处理,过滤重叠框;如果不指定,将默认使用PaddleOCR官方模型配置
    可选示例:
      -
    • bool, True/False , 表示使用/不使用NMS进行检测框的后处理过滤重叠框
    • -
    • None, 不指定,将默认不使用
    • +
    • boolTrue/False , 表示使用/不使用NMS进行检测框的后处理过滤重叠框
    • +
    • None不指定,将默认使用PaddleOCR官方模型配置
    bool/None None
    layout_unclip_ratio检测框的边长缩放倍数;如果不指定,将默认使用1.0float/list/dict/None +检测框的边长缩放倍数;如果不指定,将默认使用PaddleOCR官方模型配置
    可选示例:
      -
    • float, 大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
    • -
    • 列表, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
    • -
    • 字典, 字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • -
    • None, 不指定,将默认使用1.0
    • +
    • float:大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
    • +
    • list:如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
    • +
    • dict:字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • +
• None:不指定,将默认使用PaddleOCR官方模型配置
    float/list/dict/NoneNone
    layout_merge_bboxes_mode模型输出的检测框的合并处理模式;如果不指定,将默认使用union模式string/dict/None +模型输出的检测框的合并处理模式;如果不指定,将默认使用PaddleOCR官方模型配置
    可选示例:
      -
    • large, 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
    • -
    • small, 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
    • -
    • union, 不进行框的过滤处理,内外框都保留
    • -
    • dict, 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式
    • -
    • None, 不指定,将默认使用union模式
    • +
    • large: 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
    • +
    • small: 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
    • +
    • union: 不进行框的过滤处理,内外框都保留
    • +
• dict: 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式 +
    • None: 不指定,将默认使用PaddleOCR官方模型配置
    string/dict/None None
    use_hpip是否启用高性能推理插件boolFalse
    hpi_config高性能推理配置dict | NoneNone
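+
+下面给出一个实例化示意(参数取值仅为演示,含义以上表为准),`predict()` 的用法见后文:
+
+```python
+from paddleocr import LayoutDetection
+
+# 指定 model_name 后使用内置配置,其余参数(device、threshold、layout_nms 等)按需传入
+model = LayoutDetection(
+    model_name="PP-DocLayout_plus-L",
+    device="cpu",
+    layout_nms=True,
+)
+```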
    * 调用目标检测模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input`、`batch_size`和`threshold`,具体说明如下: - - - - - - + + - + - - - - + + + - - - + - - - + + + - - - - + + -
    参数 参数说明 参数类型可选项 默认值
    input待预测数据,支持多种输入类型Python Var/str/list +待预测数据,支持多种输入类型,必填。
      -
    • Python变量,如numpy.ndarray表示的图像数据
    • -
    • 文件路径,如图像文件的本地路径:/root/data/img.jpg
    • -
    • URL链接,如图像文件的网络URL:示例
    • -
    • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
    • -
    • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • +
    • Python Var:如 numpy.ndarray 表示的图像数据
    • +
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • +
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    Python Var|str|list
    batch_size批大小批大小,可设置为任意正整数。 int大于0的任意整数 1
    threshold用于过滤掉低置信度预测结果的阈值float/dict/None +用于过滤掉低置信度预测结果的阈值;
    可选示例:
      -
    • float,如 0.2, 表示过滤掉所有阈值小于0.2的目标框
    • -
    • 字典,字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
    • -
    • None, 不指定,将默认使用 creat_model 指定的 threshold 参数,如果 creat_model 也没有指定,则默认使用0.5
    • +
    • float:如 0.2, 表示过滤掉所有阈值小于0.2的目标框
    • +
    • dict:字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
    • +
    • None: 不指定,将默认使用模型实例化指定的 threshold 参数,如果实例化也没有指定,则默认使用PaddleOCR官方模型配置
    float/dict/NoneNone
    layout_nms是否使用NMS后处理,过滤重叠框;如果不指定,将默认不使用NMSbool/None +是否使用NMS后处理,过滤重叠框;
    可选示例:
      -
    • bool, True/False , 表示使用/不使用NMS进行检测框的后处理过滤重叠框
    • -
    • None, 不指定,将默认使用 creat_model 指定的 layout_nms 参数,如果 creat_model 也没有指定,则默认不使用NMS
    • +
    • boolTrue/False , 表示使用/不使用NMS进行检测框的后处理过滤重叠框
    • +
    • None不指定,将默认使用模型实例化指定的 layout_nms 参数,如果实例化也没有指定,则默认使用PaddleOCR官方模型配置
    bool/None None
    layout_unclip_ratio检测框的边长缩放倍数;如果不指定,将默认使用1.0float/list/dict/None +检测框的边长缩放倍数。
    可选示例:
      -
    • float, 大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
    • -
    • 列表, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
    • -
    • 字典, 字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0
    • -
    • None, 不指定,将默认使用 creat_model 指定的 layout_unclip_ratio 参数,如果 creat_model 也没有指定,则默认使用1.0
    • +
    • float:大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
    • +
• list:如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
    • +
    • dict:字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • +
    • None:不指定,将默认使用模型实例化指定的 layout_unclip_ratio 参数,如果实例化也没有指定,则默认使用PaddleOCR官方模型配置
    float/list/dict/NoneNone
    layout_merge_bboxes_mode模型输出的检测框的合并处理模式;如果不指定,将默认使用union模式string/dict/None +模型输出的检测框的合并处理模式;
    可选示例:
      -
    • large, 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
    • -
    • small, 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
    • -
    • union, 不进行框的过滤处理,内外框都保留
    • -
    • dict, 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式
    • -
    • None, 不指定,将默认使用 creat_model 指定的 layout_merge_bboxes_mode 参数,如果 creat_model 也没有指定,则默认使用union模式
    • +
    • large: 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
    • +
    • small: 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
    • +
    • union: 不进行框的过滤处理,内外框都保留
    • +
• dict: 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式 +
    • None: 不指定,将默认使用模型实例化指定的 layout_merge_bboxes_mode 参数,如果模型实例化也没有指定,则默认使用PaddleOCR官方模型配置
    string/dict/NoneNone
    + + + +

调用 predict() 时,若该参数为 None,将继承模型实例化 (__init__) 时对应参数的值;若实例化时也未显式指定,则使用框架默认值:
    +    threshold=0.5layout_nms=Falselayout_unclip_ratio=1.0layout_merge_bboxes_mode="union"
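+
+以下是演示该优先级的最小示意(图片文件名为假设值):
+
+```python
+from paddleocr import LayoutDetection
+
+model = LayoutDetection(model_name="PP-DocLayout_plus-L", threshold=0.4)
+
+# 未传 threshold:沿用实例化时的 0.4
+results = model.predict(input="layout_demo.png")
+
+# 显式传入:仅本次调用按 0.6 过滤,不改变实例配置
+results = model.predict(input="layout_demo.png", threshold=0.6)
+```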

    + * 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为图片、保存为`json`文件的操作: diff --git a/docs/version3.x/module_usage/module_overview.en.md b/docs/version3.x/module_usage/module_overview.en.md new file mode 100644 index 0000000000000000000000000000000000000000..2485629f52de91541ddb7f9d49910faf708b40aa --- /dev/null +++ b/docs/version3.x/module_usage/module_overview.en.md @@ -0,0 +1,3 @@ +# Module Overview + +A module is the smallest unit that implements basic functionality. Modules typically use a single model to accomplish specific tasks, such as text detection, image classification, and other basic functions. As fundamental building blocks, modules provide the necessary functional support for more complex application scenarios. This design approach allows users to flexibly select and combine different modules according to their needs, thereby simplifying the development process and enhancing development flexibility and efficiency. diff --git a/docs/version3.x/module_usage/module_overview.md b/docs/version3.x/module_usage/module_overview.md new file mode 100644 index 0000000000000000000000000000000000000000..c60cb76094ef4d5401eee2cb5c3721076193b882 --- /dev/null +++ b/docs/version3.x/module_usage/module_overview.md @@ -0,0 +1,3 @@ +# 模块概述 + +模块是实现基本功能的最小单位。模块通常使用单个模型去完成特定的任务,比如文本检测、图像分类等基本功能。模块作为基础构建单元,为更复杂的应用场景提供了必要的功能支持。这种设计方式使得用户可以根据需要灵活选择和组合不同的模块,从而简化了开发流程,并提高了开发的灵活性和效率。 diff --git a/docs/version3.x/module_usage/seal_text_detection.en.md b/docs/version3.x/module_usage/seal_text_detection.en.md index fee93ac45581e33683493e84f3c93e9e2239105f..494fe2ae0776ea124e4785294fd479e2f5b01299 100644 --- a/docs/version3.x/module_usage/seal_text_detection.en.md +++ b/docs/version3.x/module_usage/seal_text_detection.en.md @@ -46,7 +46,7 @@ The seal text detection module typically outputs multi-point bounding boxes arou
    • Performance Test Environment
        -
      • Test Dataset: PaddleX Custom Dataset, Containing 500 Images of Circular Stamps.
      • +
      • Test Dataset: A Self-built Internal Dataset, Containing 500 Images of Circular Stamps.
      • Hardware Configuration:
        • GPU: NVIDIA Tesla T4
        • @@ -138,235 +138,180 @@ The explanations of related methods and parameters are as follows: Parameter -Parameter Description -Parameter Type -Options -Default Value +Description +Type +Default + model_name -Name of the model +Model name. All supported seal text detection model names, such as PP-OCRv4_mobile_seal_det. str -All model names supported by PaddleX for seal text detection -None +PP-OCRv4_mobile_seal_det model_dir -Path to store the model +Model storage path str -None -None +None device -The device used for model inference +Device(s) to use for inference.
          +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
          +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
          +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -It supports specifying specific GPU card numbers, such as "gpu:0", other hardware card numbers, such as "npu:0", or CPU, such as "cpu". -gpu:0 +None -limit_side_len -Limit on the side length of the image for detection -int/None - -
            -
          • int: Any integer greater than 0 -
          • None: If set to None, the default value from the official PaddleX model configuration will be used
          - -None +enable_hpi +Whether to use the high performance inference. +bool +False -limit_type -Type of side length limit for detection -str/None - -
            -
          • str: Supports min and max. min ensures the shortest side of the image is not less than det_limit_side_len, max ensures the longest side is not greater than limit_side_len -
          • None: If set to None, the default value from the official PaddleX model configuration will be used
          - - -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False -thresh -In the output probability map, pixels with scores greater than this threshold will be considered as text pixels -float/None - -
            -
          • float: Any float greater than 0 -
          • None: If set to None, the default value from the official PaddleX model configuration will be used
          - -None +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 -box_thresh -If the average score of all pixels within a detection result box is greater than this threshold, the result will be considered as a text region -float/None - -
            -
          • float: Any float greater than 0 -
          • None: If set to None, the default value from the official PaddleX model configuration will be used
          - -None +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
          Options: fp32, fp16, etc. +str +fp32 -max_candidates -Maximum number of text boxes to output -int/None +enable_mkldnn -
            -
          • int: Any integer greater than 0 -
          • None: If set to None, the default value from the official PaddleX model configuration will be used
          - -None +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True -unclip_ratio -Expansion ratio for the Vatti clipping algorithm, used to expand the text region -float/None - -
            -
          • float: Any float greater than 0 -
          • None: If set to None, the default value from the official PaddleX model configuration will be used
          - -None +cpu_threads +Number of threads to use for inference on CPUs. +int +10 -use_dilation -Whether to dilate the segmentation result -bool/None -True/False/None -None +limit_side_len +Limit on the side length of the input image for detection. int specifies the value. If set to None, the default value from the official PaddleOCR model configuration will be used. +int / None +None -use_hpip -Whether to enable the high-performance inference plugin -bool -None -False +limit_type +Type of image side length limitation. "min" ensures the shortest side of the image is no less than det_limit_side_len; "max" ensures the longest side is no greater than limit_side_len. If set to None, the default value from the official PaddleOCR model configuration will be used. +str / None +None -hpi_config -High-performance inference configuration -dict | None -None +thresh +Pixel score threshold. Pixels in the output probability map with scores greater than this threshold are considered text pixels. Accepts any float value greater than 0. If set to None, the default value from the official PaddleOCR model configuration will be used. +float / None +None + + +box_thresh +If the average score of all pixels inside the bounding box is greater than this threshold, the result is considered a text region. Accepts any float value greater than 0. If set to None, the default value from the official PaddleOCR model configuration will be used. +float / None None + +unclip_ratio +Expansion ratio for the Vatti clipping algorithm, used to expand the text region. Accepts any float value greater than 0. If set to None, the default value from the official PaddleOCR model configuration will be used. +float / None +None + + +input_shape +Input image size for the model in the format (C, H, W). If set to None, the model's default size will be used. +tuple / None +None + + -* The `model_name` must be specified. After specifying `model_name`, the built-in model parameters of PaddleX will be used by default. On this basis, if `model_dir` is specified, the user-defined model will be used. -* The `predict()` method of the seal text detection model is called for inference prediction. The parameters of the `predict()` method include `input`, `batch_size`, `limit_side_len`, `limit_type`, `thresh`, `box_thresh`, `max_candidates`, `unclip_ratio`, and `use_dilation`. The specific descriptions are as follows: +* The `model_name` must be specified. After specifying `model_name`, the built-in model parameters of PaddleX will be used by default. On this basis, if `model_dir` is specified, the user-defined model will be used. + +* The `predict()` method of the seal text detection model is called for inference prediction. The parameters of the `predict()` method include `input`, `batch_size`, `limit_side_len`, `limit_type`, `thresh`, `box_thresh`, `max_candidates`, `unclip_ratio`. The specific descriptions are as follows: - - - - + + + - - - - + + - + - - - - - - + + + - - - - - - + + + - - - - - + + + - - - - - - - - - - - - - + + + - - - - - - - - - - - - + + + +
          ParameterParameter DescriptionParameter TypeOptionsDefault ValueDescriptionTypeDefault
          inputData to be predicted, supporting multiple input typesPython Var/str/dict/list +Input data to be predicted. Required. Supports multiple input types:
            -
          • Python Variable, such as image data represented by numpy.ndarray
          • -
          • File Path, such as the local path of an image file: /root/data/img.jpg
          • -
          • URL Link, such as the web URL of an image file: Example
          • -
          • Local Directory, the directory should contain the data files to be predicted, such as the local path: /root/data/
          • -
          • List, the elements of the list should be of the above-mentioned data types, such as [numpy.ndarray, numpy.ndarray], [\"/root/data/img1.jpg\", \"/root/data/img2.jpg\"], [\"/root/data1\", \"/root/data2\"]
          • +
          • Python Var: e.g., numpy.ndarray representing image data
          • +
          • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
          • +
          • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
          NonePython Var|str|list
          batch_sizeBatch sizeBatch size, positive integer. intAny integer greater than 0 1
          limit_side_lenSide length limit for detectionint/None -
            -
          • int: Any integer greater than 0 -
          • None: If set to None, the parameter value initialized by the model will be used by default
          NoneLimit on the side length of the input image for detection. int specifies the value. If set to None, the parameter value initialized by the model will be used by default.int / NoneNone
          limit_typeType of side length limit for detectionstr/None -
            -
          • str: Supports min and max. min indicates that the shortest side of the image is not less than det_limit_side_len, max indicates that the longest side of the image is not greater than limit_side_len -
          • None: If set to None, the parameter value initialized by the model will be used by default
          NoneType of image side length limitation. "min" ensures the shortest side of the image is no less than det_limit_side_len; "max" ensures the longest side is no greater than limit_side_len. If set to None, the parameter value initialized by the model will be used by default.str / NoneNone
          threshIn the output probability map, pixels with scores greater than this threshold will be considered as text pixelsfloat/None -
            -
          • float: Any float greater than 0 -
          • None: If set to None, the parameter value initialized by the model will be used by default
          NonePixel score threshold. Pixels in the output probability map with scores greater than this threshold are considered text pixels. Accepts any float value greater than 0. If set to None, the parameter value initialized by the model will be used by default.float / NoneNone
          box_threshIf the average score of all pixels within the detection result box is greater than this threshold, the result will be considered as a text areafloat/None -
            -
          • float: Any float greater than 0 -
          • None: If set to None, the parameter value initialized by the model will be used by default
          None
          max_candidatesMaximum number of text boxes to be outputint/None -
            -
          • int: Any integer greater than 0 -
          • None: If set to None, the parameter value initialized by the model will be used by default
          NoneIf the average score of all pixels inside the bounding box is greater than this threshold, the result is considered a text region. Accepts any float value greater than 0. If set to None, the parameter value initialized by the model will be used by default.float / NoneNone
          unclip_ratioExpansion coefficient of the Vatti clipping algorithm, used to expand the text areafloat/None -
            -
          • float: Any float greater than 0 -
          • None: If set to None, the parameter value initialized by the model will be used by default
          None
          use_dilationWhether to dilate the segmentation resultbool/NoneTrue/False/NoneNoneExpansion ratio for the Vatti clipping algorithm, used to expand the text region. Accepts any float value greater than 0. If set to None, the parameter value initialized by the model will be used by default.float / NoneNone
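+
+A minimal sketch of per-call overrides (the file name is hypothetical, the `SealTextDetection` class name is assumed from this module's quick start, and the parameter values are illustrative only):
+
+```python
+from paddleocr import SealTextDetection
+
+model = SealTextDetection(model_name="PP-OCRv4_mobile_seal_det")
+
+# Detection parameters passed here override the instance-level
+# configuration for this call only
+results = model.predict(
+    input="seal_demo.png",
+    batch_size=1,
+    limit_side_len=736,
+    limit_type="min",
+    thresh=0.2,
+    box_thresh=0.6,
+    unclip_ratio=0.5,
+)
+for res in results:
+    res.print()
+    res.save_to_img(save_path="output")
+```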
          + * Process the prediction results. Each sample's prediction result is a corresponding Result object, and it supports operations such as printing, saving as an image, and saving as a `json` file: diff --git a/docs/version3.x/module_usage/seal_text_detection.md b/docs/version3.x/module_usage/seal_text_detection.md index 65cfdb6193a9380c11af627e7d6062363611d0b3..70b071fe8c87cb1aaf7d84c4ccbc8f8fddd36f66 100644 --- a/docs/version3.x/module_usage/seal_text_detection.md +++ b/docs/version3.x/module_usage/seal_text_detection.md @@ -137,218 +137,172 @@ for res in output: - + - + - - + - - + - + - - + - - - - - - + + + + - - - - - - - + + + + - - - - - - + + + + - - - - - + + + + - - - + - +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
          + + + - - - - - + + + + - - - - - + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          参数 参数说明 参数类型可选项 默认值
          model_name模型名称模型名称。所有支持的印章文本检测模型名称,如 PP-OCRv4_mobile_seal_det str所有支持的印章文本检测模型名称"PP-OCRv4_mobile_seal_det"
          model_dir 模型存储路径 strNone
          device模型推理设备用于推理的设备。
          +例如:cpugpunpugpu:0gpu:0,1
          +如指定多个设备,将进行并行推理。
          +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 +
          str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0None
          limit_side_len检测的图像边长限制int/None -
            -
          • int: 大于0的任意整数 -
          • None: 如果设置为None, 将使用默认值:736
          Noneenable_hpi是否启用高性能推理。boolFalse
          limit_type检测的图像边长限制,检测的边长限制类型 str/None -
            -
          • str: 支持min和max. min表示保证图像最短边不小于det_limit_side_len, max: 表示保证图像最长边不大于limit_side_len -
          • None: 如果设置为None, 将使用默认值:“min”
          Noneuse_tensorrt是否启用 Paddle Inference 的 TensorRT 子图引擎。boolFalse
          thresh输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 float/None -
            -
          • float: 大于0的任意浮点数 -
          • None: 如果设置为None, 将使用默认值:0.2
          Nonemin_subgraph_size当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。int3
          box_thresh检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 float/None -
            -
          • float: 大于0的任意浮点数 -
          • None: 如果设置为None, 将使用默认值:0.6
          Noneprecision当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
          可选项:fp32fp16 等。
          strfp32
          unclip_ratioVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张 float/Noneenable_mkldnn -
            -
          • float: 大于0的任意浮点数 -
          • None: 如果设置为None, 将使用默认值:0.5
          NoneboolTrue
          use_dilation是否对分割结果进行膨胀 bool/NoneTrue/False/NoneNonecpu_threads在 CPU 上推理时使用的线程数量。int10
          use_hpip是否启用高性能推理插件boolFalselimit_side_len检测的图像边长限制:int 表示边长限制数值,如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。int / NoneNone
          hpi_config高性能推理配置dict | Nonelimit_type检测的图像边长限制,检测的边长限制类型,"min" 表示保证图像最短边不小于det_limit_side_len,"max"表示保证图像最长边不大于limit_side_len。如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。str / NoneNone
          thresh像素得分阈值。输出概率图中得分大于该阈值的像素点被认为是文本像素。可选大于0的float任意浮点数,如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。float / NoneNone
          box_thresh检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。可选大于0的float任意浮点数,如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。float / NoneNone
          unclip_ratioVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张。可选大于0的任意浮点数。如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。float / NoneNone
          input_shape模型输入图像尺寸,格式为 (C, H, W)。若为 None 将默认使用PaddleOCR官方模型配置中的该参数值。tuple / None None
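+
+下面给出一个实例化示意(类名 `SealTextDetection` 为假设写法,以本模块快速体验部分的实际导入为准;参数取值仅作演示):
+
+```python
+from paddleocr import SealTextDetection
+
+# 指定 model_name 使用内置配置;检测相关参数不传时使用官方默认配置
+model = SealTextDetection(
+    model_name="PP-OCRv4_mobile_seal_det",
+    device="cpu",
+    limit_side_len=736,
+    limit_type="min",
+)
+```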
          -* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 -* 调用印章文本检测模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input`、 `batch_size`、 `limit_side_len`、 `limit_type`、 `thresh`、 `box_thresh`、 `max_candidates`、`unclip_ratio`和`use_dilation`,具体说明如下: +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + +* 调用印章文本检测模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input`、 `batch_size`、 `limit_side_len`、 `limit_type`、 `thresh`、 `box_thresh`、 `max_candidates`、`unclip_ratio`,具体说明如下: - - - - - + + - + - - - - - - + + + - - - - - - + + + - - - - - + + + - - - - - - - - - - - - - + + + - - - - - - - - - - - - + + + +
          参数 参数说明 参数类型可选项 默认值
          input待预测数据,支持多种输入类型Python Var/str/dict/list +待预测数据,支持多种输入类型,必填。
            -
          • Python变量,如numpy.ndarray表示的图像数据
          • -
          • 文件路径,如图像文件的本地路径:/root/data/img.jpg
          • -
          • URL链接,如图像文件的网络URL:示例
          • -
          • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
          • -
          • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
          • +
          • Python Var:如 numpy.ndarray 表示的图像数据
          • +
          • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
          • +
          • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
          Python Var|str|list
          batch_size批大小批大小,可设置为任意正整数。 int大于0的任意整数 1
          limit_side_len检测的图像边长限制int/None -
            -
          • int: 大于0的任意整数 -
          • None: 如果设置为None, 将默认使用模型初始化的该参数值
None检测的图像边长限制:int 表示边长限制数值,如果设置为None, 将默认使用模型初始化的该参数值。int / NoneNone
          limit_type检测的图像边长限制,检测的边长限制类型 str/None -
            -
          • str: 支持min和max. min表示保证图像最短边不小于det_limit_side_len, max: 表示保证图像最长边不大于limit_side_len -
          • None: 如果设置为None, 将默认使用模型初始化的该参数值
          None检测的图像边长限制,检测的边长限制类型,"min" 表示保证图像最短边不小于det_limit_side_len,"max"表示保证图像最长边不大于limit_side_len。如果设置为None, 将默认使用模型初始化的该参数值。str / NoneNone
          thresh输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 float/None -
            -
          • float: 大于0的任意浮点数 -
          • None: 如果设置为None, 将默认使用模型初始化的该参数值
          None像素得分阈值。输出概率图中得分大于该阈值的像素点被认为是文本像素。可选大于0的float任意浮点数,如果设置为None, 将默认使用模型初始化的该参数值。float / NoneNone
          box_thresh检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 float/None -
            -
          • float: 大于0的任意浮点数 -
          • None: 如果设置为None, 将默认使用模型初始化的该参数值
          None
          max_candidates输出的最大文本框数量 int/None -
            -
          • int: 大于0的任意整数 -
          • None: 如果设置为None, 将默认使用模型初始化的该参数值
          None检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。可选大于0的float任意浮点数,如果设置为None, 将默认使用模型初始化的该参数值。float / NoneNone
          unclip_ratioVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张 float/None -
            -
          • float: 大于0的任意浮点数 -
          • None: 如果设置为None, 将默认使用模型初始化的该参数值
          None
          use_dilation是否对分割结果进行膨胀 bool/NoneTrue/False/NoneNoneVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张。可选大于0的任意浮点数。如果设置为None, 将默认使用模型初始化的该参数值。float / NoneNone
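+
+结合 `predict_iter()` 的最小示意(目录路径为假设值),适合批量图片场景下逐步获取结果、节省内存:
+
+```python
+from paddleocr import SealTextDetection
+
+model = SealTextDetection(model_name="PP-OCRv4_mobile_seal_det")
+
+# predict_iter() 返回 generator,逐张产出结果
+for res in model.predict_iter(input="/root/data/", batch_size=4, thresh=0.2):
+    res.print()
+    res.save_to_json(save_path="output")
+```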
          * 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为图片、保存为`json`文件的操作: diff --git a/docs/version3.x/module_usage/table_cells_detection.en.md b/docs/version3.x/module_usage/table_cells_detection.en.md index 7e89f86e7958fbbddf202ae495a05247e6752c57..b0b37c59ce3aec2548d382745e2f9ca6711fb640 100644 --- a/docs/version3.x/module_usage/table_cells_detection.en.md +++ b/docs/version3.x/module_usage/table_cells_detection.en.md @@ -81,7 +81,7 @@ The Table Cell Detection Module is a key component of the table recognition task ## III. Quick Start -> ❗ Before starting quickly, please first install the PaddleOCR wheel package. For details, please refer to the [installation tutorial](../installation.md). +> ❗ Before starting quickly, please first install the PaddleOCR wheel package. For details, please refer to the [installation tutorial](../installation.en.md). You can quickly experience it with one command: @@ -130,67 +130,90 @@ The relevant methods, parameters, etc., are described as follows: Parameter Description Type -Options -Default Value +Default + model_name -Model Name +Model name str -None -None +PP-DocLayout-L model_dir -Model Storage Path +Model storage path str -None -None +None device -Model Inference Device +Device(s) to use for inference.
          +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
          +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
          +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -Supports specifying specific GPU card numbers, such as “gpu:0”, specific hardware card numbers, such as “npu:0”, CPU as “cpu”. -gpu:0 +None -use_hpip -Whether to enable high-performance inference plugin +enable_hpi +Whether to use the high performance inference. bool -None False -hpi_config -High-Performance Inference Configuration -dict | None -None -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False -img_size -Input image size; if not specified, the PaddleX official model configuration will be used by default -int/list +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + +img_size +Size of the input image; if not specified, the default configuration of the PaddleOCR official model will be used
          Examples:
            -
          • int, e.g., 640, indicates resizing the input image to 640x640
          • -
          • list, e.g., [640, 512], indicates resizing the input image to a width of 640 and a height of 512
          • +
          • int: e.g. 640, resizes input image to 640x640
          • +
          • list: e.g. [640, 512], resizes input image to 640 width and 512 height
+int/list/None None threshold -Threshold for filtering out low-confidence prediction results; if not specified, the PaddleX official model configuration will be used by default. In table cell detection tasks, appropriately lowering the threshold may help achieve more accurate results -float/dict - +Threshold to filter out low-confidence predictions; if not specified, the default PaddleOCR official model configuration will be used. In table cell detection tasks, lowering the threshold appropriately may help to obtain more accurate results.
          Examples:
            -
          • float, e.g., 0.2, indicates filtering out all bounding boxes with confidence lower than 0.2
          • -
          • dictionary, where the key is of type int representing cls_id, and the value is of type float representing the threshold. For example, {0: 0.45, 2: 0.48, 7: 0.4} applies a threshold of 0.45 for category cls_id 0, 0.48 for category cls_id 1, and 0.4 for category cls_id 7
          • +
          • float, e.g., 0.3, indicates filtering out all bounding boxes with confidence lower than 0.3
          • +
          • dictionary, where the key is of type int representing cls_id, and the value is of type float representing the threshold. For example, {0: 0.3} applies a threshold of 0.3 for category cls_id 0
          +float/dict/None None @@ -205,42 +228,40 @@ The relevant methods, parameters, etc., are described as follows: Parameter Description Type -Options -Default Value +Default input -Data to be predicted, supports multiple input types -Python Var/str/list - +Input data to be predicted. Required. Supports multiple input types:
            -
          • Python Variable, such as numpy.ndarray representing image data
          • -
          • File Path, such as the local path of an image file: /root/data/img.jpg
          • -
          • URL Link, such as the network URL of an image file: Example
          • -
          • Local Directory, which should contain data files to be predicted, such as the local path: /root/data/
          • -
          • List, where list elements must be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
          • +
          • Python Var: e.g., numpy.ndarray representing image data
          • +
          • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
          • +
          • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
          -None +Python Var|str|list + batch_size -Batch Size +Batch size, positive integer. int -Any integer 1 threshold -Threshold for filtering out low-confidence prediction results; if not specified, the threshold parameter specified in create_model will be used by default, and if create_model is not specified, the PaddleX official model configuration will be used -float/dict - +Threshold for filtering out low-confidence prediction results;
          Examples:
            -
          • float, e.g., 0.2, indicates filtering out all bounding boxes with confidence lower than 0.2
          • -
          • dictionary, where the key is of type int representing cls_id, and the value is of type float representing the threshold. For example, {0: 0.45, 2: 0.48, 7: 0.4} applies a threshold of 0.45 for category cls_id 0, 0.48 for category cls_id 1, and 0.4 for category cls_id 7
          • +
          • float: e.g., 0.2, filters out all boxes with scores below 0.2
          • +
          • dict: keys are int representing cls_id, and values are float thresholds. For example, {0: 0.45, 2: 0.48, 7: 0.4} applies thresholds of 0.45 to class 0, 0.48 to class 2, and 0.4 to class 7
          • +
• None: if not specified, the threshold parameter specified in create_model will be used by default, and if create_model also does not specify it, the default PaddleOCR official model configuration will be used
          +float/dict/None None diff --git a/docs/version3.x/module_usage/table_cells_detection.md b/docs/version3.x/module_usage/table_cells_detection.md index 4461c64226f042892a606b42b1eb24b9fe0cf80a..d203c115ec21b80b0389376f83e07e894318ea2b 100644 --- a/docs/version3.x/module_usage/table_cells_detection.md +++ b/docs/version3.x/module_usage/table_cells_detection.md @@ -129,68 +129,91 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 + model_name 模型名称 str -无 -无 +PP-DocLayout-L model_dir 模型存储路径 str -无 -无 +None device -模型推理设备 +用于推理的设备。
          +例如:cpugpunpugpu:0gpu:0,1
          +如指定多个设备,将进行并行推理。
          +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 + str -支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 -gpu:0 +None -use_hpip -是否启用高性能推理插件 +enable_hpi +是否启用高性能推理。 bool -无 False -hpi_config -高性能推理配置 -dict | None -无 -None +use_tensorrt +是否启用 Paddle Inference 的 TensorRT 子图引擎。 +bool +False -img_size -输入图像大小 -int/list +min_subgraph_size +当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。 +int +3 + + +precision +当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
          可选项:fp32fp16 等。 +str +fp32 + + +enable_mkldnn +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
          + +bool +True + + +cpu_threads +在 CPU 上推理时使用的线程数量。 +int +10 + + +img_size +输入图像大小;如果不指定,将默认使用PaddleOCR官方模型配置
          可选示例:
            -
          • int, 如 640 , 表示将输入图像resize到640x640大小
          • -
          • 列表, 如 [640, 512] , 表示将输入图像resize到宽为640,高为512大小
          • +
          • int:如 640 , 表示将输入图像resize到640x640大小
          • +
          • list: 如 [640, 512] , 表示将输入图像resize到宽为640,高为512大小
          -无 +int/list/None +None threshold -用于过滤掉低置信度预测结果的阈值。在表格单元格检测任务中,适当降低阈值可能有助于获得更准确的结果 -float/dict - +用于过滤掉低置信度预测结果的阈值;如果不指定,将默认使用PaddleOCR官方模型配置。在表格单元格检测任务中,适当降低阈值可能有助于获得更准确的结果
          可选示例:
            -
          • float,如 0.2, 表示过滤掉所有阈值小于0.2的目标框
          • -
          • 字典,字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
          • +
          • float:如 0.2, 表示过滤掉所有阈值小于0.2的目标框
          • +
          • dict:字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
          -无 +float/dict/None +None @@ -204,43 +227,37 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据,支持多种输入类型 -Python Var/str/list - +待预测数据,支持多种输入类型,必填。
            -
          • Python变量,如numpy.ndarray表示的图像数据
          • -
          • 文件路径,如图像文件的本地路径:/root/data/img.jpg
          • -
          • URL链接,如图像文件的网络URL:示例
          • -
          • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
          • -
          • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
          • +
          • Python Var:如 numpy.ndarray 表示的图像数据
          • +
          • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
          • +
          • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
          -无 +Python Var|str|list + batch_size -批大小 +批大小,可设置为任意正整数。 int -任意整数 1 threshold -用于过滤掉低置信度预测结果的阈值 -float/dict - +用于过滤掉低置信度预测结果的阈值;如果不指定,将默认使用模型初始化指定的 threshold 参数,如果初始化也没有指定,则默认使用PaddleOCR官方模型配置
          可选示例:
            -
          • float,如 0.2, 表示过滤掉所有阈值小于0.2的目标框
          • -
          • 字典,字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
          • +
          • float:如 0.2, 表示过滤掉所有阈值小于0.2的目标框
          • +
          • dict:字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为1的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
          -无 +float/dict/None +None diff --git a/docs/version3.x/module_usage/table_classification.en.md b/docs/version3.x/module_usage/table_classification.en.md index 13214b5c0c9791632f0ed1fae807cfb68794eb5e..30fc0aaf2706b22cf162456e7d96331b91c99acf 100644 --- a/docs/version3.x/module_usage/table_classification.en.md +++ b/docs/version3.x/module_usage/table_classification.en.md @@ -74,7 +74,7 @@ The Table Classification Module is a key component in computer vision systems, r ## 3. Quick Start -> ❗ Before starting quickly, please first install the PaddleOCR wheel package. For details, please refer to the [installation tutorial](../installation.md). +> ❗ Before starting quickly, please first install the PaddleOCR wheel package. For details, please refer to the [installation tutorial](../installation.en.md). You can quickly experience it with one command: @@ -119,47 +119,74 @@ The relevant methods, parameters, etc., are described as follows: Parameter Description Type -Options -Default Value +Default + model_name -Model Name +Name of the model str -None -None +PP-LCNet_x1_0_doc_ori model_dir -Model Storage Path +Model storage path str -None -None +None device -Model Inference Device +Device(s) to use for inference.
          +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
          +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
          +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -Supports specifying specific GPU card numbers, such as “gpu:0”, specific hardware card numbers, such as “npu:0”, CPU as “cpu”. -gpu:0 +None -use_hpip -Whether to enable high-performance inference plugin +enable_hpi +Whether to use the high performance inference. bool -None False -hpi_config -High-Performance Inference Configuration -dict | None -None -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False + + +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
          Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn + +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + + * Among them, `model_name` must be specified. After specifying `model_name`, the default model parameters built into PaddleX are used. When `model_dir` is specified, the user-defined model is used. * Call the `predict()` method of the table classification model for inference prediction. This method will return a result list. Additionally, this module also provides a `predict_iter()` method. Both methods are consistent in terms of parameter acceptance and result return. The difference is that `predict_iter()` returns a `generator`, which can process and obtain prediction results step by step, suitable for handling large datasets or scenarios where memory saving is desired. You can choose to use either of these methods according to your actual needs. The `predict()` method has parameters `input` and `batch_size`, with specific explanations as follows: @@ -170,30 +197,28 @@ The relevant methods, parameters, etc., are described as follows: Parameter Description Type -Options -Default Value +Default input -Data to be predicted, supports multiple input types -Python Var/str/list - +Input data to be predicted. Required. Supports multiple input types:
            -
          • Python Variable, such as numpy.ndarray representing image data
          • -
          • File Path, such as the local path of an image file: /root/data/img.jpg
          • -
          • URL Link, such as the network URL of an image file: Example
          • -
          • Local Directory, which should contain data files to be predicted, such as the local path: /root/data/
          • -
          • List, where list elements must be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
          • +
          • Python Var: e.g., numpy.ndarray representing image data
          • +
          • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
          • +
          • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
          -None +Python Var|str|list + batch_size -Batch Size +Batch size, positive integer. int -Any integer 1 diff --git a/docs/version3.x/module_usage/table_classification.md b/docs/version3.x/module_usage/table_classification.md index 0be4daad9cc32e47783ca69a1ead29157ef96e88..b2a4cfb4b1e201e9a0fe864f01f5330f7621537c 100644 --- a/docs/version3.x/module_usage/table_classification.md +++ b/docs/version3.x/module_usage/table_classification.md @@ -117,45 +117,71 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 + model_name 模型名称 str -无 - +None model_dir 模型存储路径 str -无 -无 +None device -模型推理设备 +用于推理的设备。
          +例如:cpugpunpugpu:0gpu:0,1
+如指定多个设备,将进行并行推理。请注意,并非所有情况下都支持并行推理。<br/>
          +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 + str -支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 -gpu:0 +None -use_hpip -是否启用高性能推理插件 +enable_hpi +是否启用高性能推理。 bool -无 False -hpi_config -高性能推理配置 -dict | None -无 -None +use_tensorrt +是否启用 Paddle Inference 的 TensorRT 子图引擎。 +bool +False + + +min_subgraph_size +当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。 +int +3 + +precision +当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
          可选项:fp32fp16 等。 +str +fp32 + + +enable_mkldnn + +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
          + +bool +True + + +cpu_threads +在 CPU 上推理时使用的线程数量。 +int +10 + + * 其中,`model_name` 必须指定,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 @@ -168,30 +194,25 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据,支持多种输入类型 -Python Var/str/list - +待预测数据,支持多种输入类型,必填。
            -
          • Python变量,如numpy.ndarray表示的图像数据
          • -
          • 文件路径,如图像文件的本地路径:/root/data/img.jpg
          • -
          • URL链接,如图像文件的网络URL:示例
          • -
          • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
          • -
          • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
          • +
          • Python Var:如 numpy.ndarray 表示的图像数据
          • +
• str:图像文件或PDF文件的本地路径,如 /root/data/img.jpg;图像文件或PDF文件的网络URL,如 示例;本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
          • +
          • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
-无
+Python Var|str|list
+

batch_size
-批大小
+批大小,可设置为任意正整数。
int
-任意整数
1

@@ -272,6 +293,6 @@ for res in output:

 ## 四、二次开发

-由于 PaddleOCR 并不直接提供表格分类模块的训练,因此,如果需要训练表格分类模型,可以参考 [PaddleX 表格分类模块二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_classification.html#_4)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 中进行推理。
+由于 PaddleOCR 并不直接提供表格分类模块的训练,因此,如果需要训练表格分类模型,可以参考 [PaddleX 表格分类模块二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_classification.html#_5)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 中进行推理。

 ## 五、FAQ
diff --git a/docs/version3.x/module_usage/table_structure_recognition.en.md b/docs/version3.x/module_usage/table_structure_recognition.en.md
index d89bb235ea7b3c5a90b8c0deca58b9b2b419e420..7864c218c5674c36f8de56aeafea11ceba21d508 100644
--- a/docs/version3.x/module_usage/table_structure_recognition.en.md
+++ b/docs/version3.x/module_usage/table_structure_recognition.en.md
@@ -6,7 +6,7 @@ comments: true

 ## 1. Overview

-Table structure recognition is an important component of table recognition systems, capable of converting non-editable table images into editable table formats (such as HTML). The goal of table structure recognition is to identify the positions of rows, columns, and cells in tables. The performance of this module directly affects the accuracy and efficiency of the entire table recognition system. The table structure recognition module usually outputs HTML or Latex code for the table area, which is then passed as input to the table content recognition module for further processing.
+Table structure recognition is an important component of table recognition systems, capable of converting non-editable table images into editable table formats (such as HTML). The goal of table structure recognition is to identify the positions of rows, columns, and cells in tables. The performance of this module directly affects the accuracy and efficiency of the entire table recognition system. The table structure recognition module usually outputs HTML code for the table area, which is then passed as input to the table recognition pipeline for further processing.

 ## 2. Supported Model List

@@ -56,7 +56,7 @@ Table structure recognition is an important component of table recognition syste
          • Performance Test Environment
              -
            • Test Dataset: High-difficulty Chinese table recognition dataset built internally by PaddleX.
            • +
            • Test Dataset: High-difficulty Chinese table recognition dataset.
            • Hardware Configuration:
              • GPU: NVIDIA Tesla T4
              • @@ -139,85 +139,109 @@ Descriptions of related methods and parameters are as follows: Parameter Description Type -Options Default + model_name -Model name +Name of the model str -All model names supported by PaddleX -None +None model_dir Model storage path str -None -None +None device -Model inference device +Device(s) to use for inference.
                +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
                +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
+By default, GPU 0 will be used if available; otherwise, the CPU will be used.
+
str
-Supports specifying specific GPU cards, such as “gpu:0”, other hardware such as “npu:0”, CPU as “cpu”.
-gpu:0
+None


-use_hpip
-Whether to enable high-performance inference plugin
+enable_hpi
+Whether to use the high performance inference.
bool
-None
False


-hpi_config
-High-performance inference configuration
-dict | None
-None
-None
+use_tensorrt
+Whether to use the Paddle Inference TensorRT subgraph engine.
+bool
+False
+
+
+min_subgraph_size
+Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine.
+int
+3
+
+
+precision
+Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.<br/>
                Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn + +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + -* Among them, `model_name` must be specified. After specifying `model_name`, the built-in model parameters of PaddleX are used by default. On this basis, if `model_dir` is specified, the user's custom model is used. +* Among them, `model_name` must be specified. If `model_dir` is specified, the user's custom model is used. * Call the `predict()` method of the table structure recognition model for inference prediction, which returns a result list. In addition, this module also provides the `predict_iter()` method. The two are completely consistent in parameter acceptance and result return. The difference is that `predict_iter()` returns a `generator`, which can process and obtain prediction results step by step, suitable for handling large datasets or scenarios where you want to save memory. You can choose to use either method according to your actual needs. The `predict()` method has parameters `input` and `batch_size`, described as follows: + - - - - - + + - + -
                Parameter Description TypeOptions Default
                inputData to be predicted, supports multiple input typesPython Var/str/list +Input data to be predicted. Required. Supports multiple input types:
                  -
                • Python variable, such as numpy.ndarray image data
                • -
                • File path, such as the local path of the image file: /root/data/img.jpg
                • -
                • URL link, such as the network URL of the image file: Example
                • -
                • Local directory, which should contain the data files to be predicted, e.g., /root/data/
                • -
                • List, list elements should be the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
                • +
                • Python Var: e.g., numpy.ndarray representing image data
                • +
                • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
                • +
                • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
                NonePython Var|str|list
                batch_sizeBatch sizeBatch size, positive integer. intAny integer 1
                - * For processing prediction results, the prediction result of each sample is the corresponding Result object, and supports printing and saving as a `json` file: @@ -290,7 +314,7 @@ Descriptions of related methods and parameters are as follows: ## 4. Secondary Development -If the above models are still not ideal for your scenario, you can try the following steps for secondary development. Here, training `SLANet` is used as an example, and for other models, just replace the corresponding configuration file. First, you need to prepare a dataset for table structure recognition, which can be prepared with reference to the format of the [table structure recognition demo data](https://paddle-model-ecology.bj.bcebos.com/paddlex/data/table_rec_dataset_examples.tar). Once ready, you can train and export the model as follows. After exporting, you can quickly integrate the model into the above API. Here, the table structure recognition demo data is used as an example. Before training the model, please make sure you have installed the dependencies required by PaddleOCR according to the [installation documentation](../installation.en.md). +If the above models are still not ideal for your scenario, you can try the following steps for secondary development. Here, training `SLANet_plus` is used as an example, and for other models, just replace the corresponding configuration file. First, you need to prepare a dataset for table structure recognition, which can be prepared with reference to the format of the [table structure recognition demo data](https://paddle-model-ecology.bj.bcebos.com/paddlex/data/table_rec_dataset_examples.tar). Once ready, you can train and export the model as follows. After exporting, you can quickly integrate the model into the above API. Here, the table structure recognition demo data is used as an example. Before training the model, please make sure you have installed the dependencies required by PaddleOCR according to the [installation documentation](../installation.en.md). ## 4.1 Dataset and Pretrained Model Preparation @@ -306,24 +330,35 @@ tar -xf table_rec_dataset_examples.tar ### 4.1.2 Download Pretrained Model ```shell -# Download SLANet pretrained model -wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/SLANet_pretrained.pdparams +# Download SLANet_plus pretrained model +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/SLANet_plus_pretrained.pdparams ``` ### 4.2 Model Training -PaddleOCR is modularized. When training the `SLANet` recognition model, you need to use the [configuration file](https://github.com/PaddlePaddle/PaddleOCR/blob/main/configs/table/SLANet.yml) of `SLANet`. +PaddleOCR is modularized. When training the `SLANet_plus` recognition model, you need to use the [configuration file](https://github.com/PaddlePaddle/PaddleOCR/blob/main/configs/table/SLANet_plus.yml) of `SLANet_plus`. 
The training commands are as follows:

```bash
# Single card training (default training method)
-python3 tools/train.py -c configs/table/SLANet.yml \
-   -o Global.pretrained_model=./SLANet_pretrained.pdparams
+python3 tools/train.py -c configs/table/SLANet_plus.yml \
+   -o Global.pretrained_model=./SLANet_plus_pretrained.pdparams \
+   Train.dataset.data_dir=./table_rec_dataset_examples \
+   Train.dataset.label_file_list='[./table_rec_dataset_examples/train.txt]' \
+   Eval.dataset.data_dir=./table_rec_dataset_examples \
+   Eval.dataset.label_file_list='[./table_rec_dataset_examples/val.txt]'
+

# Multi-card training, specify card numbers via --gpus parameter
-python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/SLANet.yml \
-   -o Global.pretrained_model=./SLANet_pretrained.pdparams
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \
+   -c configs/table/SLANet_plus.yml \
+   -o Global.pretrained_model=./SLANet_plus_pretrained.pdparams \
+   Train.dataset.data_dir=./table_rec_dataset_examples \
+   Train.dataset.label_file_list='[./table_rec_dataset_examples/train.txt]' \
+   Eval.dataset.data_dir=./table_rec_dataset_examples \
+   Eval.dataset.label_file_list='[./table_rec_dataset_examples/val.txt]'
```

@@ -334,21 +369,23 @@ You can evaluate the trained weights, such as `output/xxx/xxx.pdparams`, using t

```bash
# Note to set the path of pretrained_model to the local path. If you use the model saved by your own training, please modify the path and file name to {path/to/weights}/{model_name}.
# Demo test set evaluation
- python3 tools/eval.py -c configs/table/SLANet.yml -o \
-   Global.pretrained_model=output/xxx/xxx.pdparams
+ python3 tools/eval.py -c configs/table/SLANet_plus.yml -o \
+   Global.pretrained_model=output/xxx/xxx.pdparams \
+   Eval.dataset.data_dir=./table_rec_dataset_examples \
+   Eval.dataset.label_file_list='[./table_rec_dataset_examples/val.txt]'
```

### 4.4 Model Export

```bash
- python3 tools/export_model.py -c configs/table/SLANet.yml -o \
-  Global.pretrained_model=output/xxx/xxx.pdparams \
-  save_inference_dir="./SLANet_infer/"
+ python3 tools/export_model.py -c configs/table/SLANet_plus.yml -o \
+  Global.pretrained_model=output/xxx/xxx.pdparams \
+  Global.save_inference_dir="./SLANet_plus_infer/"
```

- After exporting the model, the static graph model will be stored in `./SLANet_infer/` in the current directory. In this directory, you will see the following files:
+ After exporting the model, the static graph model will be stored in `./SLANet_plus_infer/` in the current directory.
In this directory, you will see the following files: ``` - ./SLANet_infer/ + ./SLANet_plus_infer/ ├── inference.json ├── inference.pdiparams ├── inference.yml diff --git a/docs/version3.x/module_usage/table_structure_recognition.md b/docs/version3.x/module_usage/table_structure_recognition.md index 899b96bf475508f604d861236fb08aef0e1b7926..9c0f1cc6db21c5b27f2b1db2a998175cb2be1f86 100644 --- a/docs/version3.x/module_usage/table_structure_recognition.md +++ b/docs/version3.x/module_usage/table_structure_recognition.md @@ -6,7 +6,7 @@ comments: true ## 一、概述 -表格结构识别是表格识别系统中的重要组成部分,能够将不可编辑表格图片转换为可编辑的表格形式(例如html)。表格结构识别的目标是对表格的行、列和单元格位置进行识别,该模块的性能直接影响到整个表格识别系统的准确性和效率。表格结构识别模块通常会输出表格区域的html代码或Latex代码,这些代码将作为输入传递给表格内容识别模块进行后续处理。 +表格结构识别是表格识别系统中的重要组成部分,能够将不可编辑表格图片转换为可编辑的表格形式(例如html)。表格结构识别的目标是对表格的行、列和单元格位置进行识别,该模块的性能直接影响到整个表格识别系统的准确性和效率。表格结构识别模块会输出表格区域的html代码,这些代码将作为输入传递给表格识别产线进行后续处理。 ## 二、支持模型列表 @@ -139,45 +139,71 @@ for res in output: - + - - + - - + - + - - + - - + + - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                参数 参数说明 参数类型可选项 默认值
                model_name 模型名称 str所有支持的模型名称None
                model_dir 模型存储路径 strNone
                device模型推理设备用于推理的设备。
                +例如:cpugpunpugpu:0gpu:0,1
+如指定多个设备,将进行并行推理。请注意,并非所有情况下都支持并行推理。<br/>
                +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 +
                str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0None
                use_hpip是否启用高性能推理插件enable_hpi是否启用高性能推理。 bool False
                hpi_config高性能推理配置dict | NoneNoneuse_tensorrt是否启用 Paddle Inference 的 TensorRT 子图引擎。boolFalse
                min_subgraph_size当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。int3
                precision当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
                可选项:fp32fp16 等。
                strfp32
                enable_mkldnn +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
                +
                boolTrue
                cpu_threads在 CPU 上推理时使用的线程数量。int10
                * 其中,`model_name` 必须指定,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 @@ -190,30 +216,25 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据,支持多种输入类型 -Python Var/str/list - +待预测数据,支持多种输入类型,必填。
                  -
                • Python变量,如numpy.ndarray表示的图像数据
                • -
                • 文件路径,如图像文件的本地路径:/root/data/img.jpg
                • -
                • URL链接,如图像文件的网络URL:示例
                • -
                • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
                • -
                • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                • +
                • Python Var:如 numpy.ndarray 表示的图像数据
                • +
• str:图像文件或PDF文件的本地路径,如 /root/data/img.jpg;图像文件或PDF文件的网络URL,如 示例;本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
                • +
                • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                -无 +Python Var|str|list + batch_size -批大小 +批大小,可设置为任意正整数。 int -任意整数 1 @@ -290,7 +311,7 @@ for res in output: ## 四、二次开发 -如果以上模型在您的场景上效果仍然不理想,您可以尝试以下步骤进行二次开发,此处以训练 `SLANet` 举例,其他模型替换对应配置文件即可。首先,您需要准备表格结构识别的数据集,可以参考[表格结构识别 Demo 数据](https://paddle-model-ecology.bj.bcebos.com/paddlex/data/table_rec_dataset_examples.tar)的格式准备,准备好后,即可按照以下步骤进行模型训练和导出,导出后,可以将模型快速集成到上述 API 中。此处以表格结构识别 Demo 数据示例。在训练模型之前,请确保已经按照[[安装文档](../installation.md)安装了 PaddleOCR 所需要的依赖。 +如果以上模型在您的场景上效果仍然不理想,您可以尝试以下步骤进行二次开发,此处以训练 `SLANet_plus` 举例,其他模型替换对应配置文件即可。首先,您需要准备表格结构识别的数据集,可以参考[表格结构识别 Demo 数据](https://paddle-model-ecology.bj.bcebos.com/paddlex/data/table_rec_dataset_examples.tar)的格式准备,准备好后,即可按照以下步骤进行模型训练和导出,导出后,可以将模型快速集成到上述 API 中。此处以表格结构识别 Demo 数据示例。在训练模型之前,请确保已经按照[安装文档](../installation.md)安装了 PaddleOCR 所需要的依赖。 ### 4.1 数据集、预训练模型准备 @@ -306,24 +327,35 @@ tar -xf table_rec_dataset_examples.tar #### 4.1.2 下载预训练模型 ```shell -# 下载 SLANet 预训练模型 -wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/SLANet_pretrained.pdparams +# 下载 SLANet_plus 预训练模型 +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/SLANet_plus_pretrained.pdparams ``` ### 4.2 模型训练 -PaddleOCR 对代码进行了模块化,训练 `SLANet` 识别模型时需要使用 `SLANet` 的[配置文件](https://github.com/PaddlePaddle/PaddleOCR/blob/main/configs/table/SLANet.yml)。 +PaddleOCR 对代码进行了模块化,训练 `SLANet_plus` 识别模型时需要使用 `SLANet_plus` 的[配置文件](https://github.com/PaddlePaddle/PaddleOCR/blob/main/configs/table/SLANet_plus.yml)。 训练命令如下: ```bash #单卡训练 (默认训练方式) -python3 tools/train.py -c configs/table/SLANet.yml \ - -o Global.pretrained_model=./SLANet_pretrained.pdparams +python3 tools/train.py -c configs/table/SLANet_plus.yml \ + -o Global.pretrained_model=./SLANet_plus_pretrained.pdparams + Train.dataset.data_dir=./table_rec_dataset_examples \ + Train.dataset.label_file_list='[./table_rec_dataset_examples/train.txt]' \ + Eval.dataset.data_dir=./table_rec_dataset_examples \ + Eval.dataset.label_file_list='[./table_rec_dataset_examples/val.txt]' + #多卡训练,通过--gpus参数指定卡号 -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/SLANet.yml \ - -o Global.pretrained_model=./SLANet_pretrained.pdparams +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/table/SLANet_plus.yml \ + -o Global.pretrained_model=./SLANet_plus_pretrained.pdparams + -o Global.pretrained_model=./PP-OCRv5_server_det_pretrained.pdparams \ + Train.dataset.data_dir=./table_rec_dataset_examples \ + Train.dataset.label_file_list='[./table_rec_dataset_examples/train.txt]' \ + Eval.dataset.data_dir=./table_rec_dataset_examples \ + Eval.dataset.label_file_list='[./table_rec_dataset_examples/val.txt]' ``` @@ -334,21 +366,23 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs ```bash # 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型,请注意修改路径和文件名为{path/to/weights}/{model_name}。 # demo 测试集评估 - python3 tools/eval.py -c configs/table/SLANet.yml -o \ - Global.pretrained_model=output/xxx/xxx.pdparams + python3 tools/eval.py -c configs/table/SLANet_plus.yml -o \ + Global.pretrained_model=output/xxx/xxx.pdparams + Eval.dataset.data_dir=./table_rec_dataset_examples \ + Eval.dataset.label_file_list='[./table_rec_dataset_examples/val.txt]' ``` ### 4.4 模型导出 ```bash - python3 tools/export_model.py -c configs/table/SLANet.yml -o \ - Global.pretrained_model=output/xxx/xxx.pdparams \ - save_inference_dir="./SLANet_infer/" + python3 tools/export_model.py -c configs/table/SLANet_plus.yml -o \ + 
Global.pretrained_model=output/xxx/xxx.pdparams \ + Global.save_inference_dir="./SLANet_plus_infer/" ``` - 导出模型后,静态图模型会存放于当前目录的`./SLANet_infer/`中,在该目录下,您将看到如下文件: + 导出模型后,静态图模型会存放于当前目录的`./SLANet_plus_infer/`中,在该目录下,您将看到如下文件: ``` - ./SLANet_infer/ + ./SLANet_plus_infer/ ├── inference.json ├── inference.pdiparams ├── inference.yml diff --git a/docs/version3.x/module_usage/text_detection.en.md b/docs/version3.x/module_usage/text_detection.en.md index e5928e757cbf2a137d97c4080ca41e44aa6454a2..fbcbe1cfb0f9b7e771bf815f714bbb660be0280d 100644 --- a/docs/version3.x/module_usage/text_detection.en.md +++ b/docs/version3.x/module_usage/text_detection.en.md @@ -113,7 +113,7 @@ You can also integrate the model inference into your project. Before running the ```python from paddleocr import TextDetection -model = TextDetection(model_name="PP-OCRv5_mobile_det") +model = TextDetection(model_name="PP-OCRv5_server_det") output = model.predict("general_ocr_001.png", batch_size=1) for res in output: res.print() @@ -130,9 +130,9 @@ The output will be: ..., - [[ 37, 408], + [[ 31, 406], ..., - [ 39, 453]]], dtype=int16), 'dt_scores': [0.832930755107492, 0.8186143846140158, 0.8591595100376676, 0.8718863959111733]}} + [ 34, 455]]], dtype=int16), 'dt_scores': [0.873949039891189, 0.8948166013613552, 0.8842595305917041, 0.876953790920377]}} ``` Output parameter meanings: @@ -147,87 +147,114 @@ Visualization example: Method and parameter descriptions: -* Instantiate the text detection model (e.g., `PP-OCRv5_mobile_det`): +* Instantiate the text detection model (e.g., `PP-OCRv5_server_det`): - + - + - - + - - + - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - - - - - - - - + + + - - - - + + + +
                Parameter Description TypeOptions Default
model_nameModel nameModel name. All supported text detection model names, such as PP-OCRv5_mobile_det. strAll PaddleX-supported text detection model namesRequiredNone
                model_dir Model storage path strN/AN/ANone
                deviceInference deviceDevice(s) to use for inference.
                +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
                +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
                +By default, GPU 0 will be used if available; otherwise, the CPU will be used. +
                strGPU (e.g., "gpu:0"), NPU (e.g., "npu:0"), CPU ("cpu")gpu:0None
                enable_hpiWhether to use the high performance inference.boolFalse
                use_tensorrtWhether to use the Paddle Inference TensorRT subgraph engine.boolFalse
                min_subgraph_sizeMinimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine.int3
                precisionPrecision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
                Options: fp32, fp16, etc.
                strfp32
                enable_mkldnn +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +boolTrue
                cpu_threadsNumber of threads to use for inference on CPUs.int10
                limit_side_lenImage side length limit for detectionint/NonePositive integer or None (uses default model config)NoneLimit on the side length of the input image for detection. int specifies the value. If set to None, the default value from the official PaddleOCR model configuration will be used.int / NoneNone
limit_typeSide length restriction typestr/None"min" (shortest side ≥ limit) or "max" (longest side ≤ limit)NoneType of image side length limitation. "min" ensures the shortest side of the image is no less than limit_side_len; "max" ensures the longest side is no greater than limit_side_len. If set to None, the default value from the official PaddleOCR model configuration will be used.str / NoneNone
                threshPixel score threshold for text detectionfloat/NonePositive float or None (uses default model config)NonePixel score threshold. Pixels in the output probability map with scores greater than this threshold are considered text pixels. Accepts any float value greater than 0. If set to None, the default value from the official PaddleOCR model configuration will be used.float / NoneNone
                box_threshAverage score threshold for text regionsfloat/NonePositive float or None (uses default model config)NoneIf the average score of all pixels inside the bounding box is greater than this threshold, the result is considered a text region. Accepts any float value greater than 0. If set to None, the default value from the official PaddleOCR model configuration will be used.float / NoneNone
                unclip_ratioExpansion coefficient for Vatti clipping algorithmfloat/NonePositive float or None (uses default model config)None
                use_hpipEnable high-performance inference pluginboolN/AFalseExpansion ratio for the Vatti clipping algorithm, used to expand the text region. Accepts any float value greater than 0. If set to None, the default value from the official PaddleOCR model configuration will be used.float / NoneNone
                hpi_configHigh-performance inference configurationdict | NoneN/Ainput_shapeInput image size for the model in the format (C, H, W). If set to None, the model's default size will be used.tuple / None None
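As a quick orientation before the `predict()` details, the following is a minimal sketch of how these constructor parameters fit together; the numeric threshold values are illustrative assumptions rather than recommended defaults:

```python
from paddleocr import TextDetection

# Minimal sketch: instantiate the detector with explicit post-processing
# parameters. All numeric values below are illustrative, not tuned defaults.
model = TextDetection(
    model_name="PP-OCRv5_server_det",
    device="cpu",       # e.g., "gpu:0" if a GPU is available
    thresh=0.3,         # pixel score threshold (illustrative value)
    box_thresh=0.6,     # text-region average score threshold (illustrative value)
    unclip_ratio=1.5,   # Vatti clipping expansion ratio (illustrative value)
)
output = model.predict("general_ocr_001.png", batch_size=1)
for res in output:
    res.print()
    res.save_to_json("./output/res.json")
```

Any parameter left unset falls back to the default behavior described in the table above.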
                * The `predict()` method parameters: @@ -237,61 +264,65 @@ Method and parameter descriptions: Parameter Description Type -Options Default + input -Input data (image path, URL, directory, or list) -Python Var/str/dict/list -Numpy array, file path, URL, directory, or list of these -Required + +Input data to be predicted. Required. Supports multiple input types: +
                  +
                • Python variable: e.g., numpy.ndarray representing image data
                • +
                • File path: e.g., local image file path /root/data/img.jpg
                • +
                • URL: e.g., image file URL: Example
                • +
                • Directory: should contain image files for prediction (PDF files are not supported)
                • +
                • List: contains elements of the above types, e.g., [numpy.ndarray, "/root/data/img.jpg"]
                • +
                + +Python Var / str / dict / list + batch_size -Batch size +Batch size, positive integer. int -Positive integer 1 limit_side_len -Image side length limit for detection -int/None -Positive integer or None (uses model default) -None +Limit on the side length of the input image for detection. int specifies the value. If set to None, the parameter value initialized by the model will be used by default. +int / None +None limit_type -Side length restriction type -str/None -"min" or "max" -None +Type of image side length limitation. "min" ensures the shortest side of the image is no less than det_limit_side_len; "max" ensures the longest side is no greater than limit_side_len. If set to None, the parameter value initialized by the model will be used by default. +str / None +None thresh -Pixel score threshold for text detection -float/None -Positive float or None (uses model default) -None +Pixel score threshold. Pixels in the output probability map with scores greater than this threshold are considered text pixels. Accepts any float value greater than 0. If set to None, the parameter value initialized by the model will be used by default. +float / None +None box_thresh -Average score threshold for text regions -float/None -Positive float or None (uses model default) -None +If the average score of all pixels inside the bounding box is greater than this threshold, the result is considered a text region. Accepts any float value greater than 0. If set to None, the parameter value initialized by the model will be used by default. +float / None +None unclip_ratio -Expansion coefficient for Vatti clipping algorithm -float/None -Positive float or None (uses model default) -None +Expansion ratio for the Vatti clipping algorithm, used to expand the text region. Accepts any float value greater than 0. If set to None, the parameter value initialized by the model will be used by default. 
+float / None +None + + * Result processing methods: @@ -404,18 +435,18 @@ Training command: python3 tools/train.py -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml \ -o Global.pretrained_model=./PP-OCRv5_server_det_pretrained.pdparams \ Train.dataset.data_dir=./ocr_det_dataset_examples \ - Train.dataset.label_file_list=[./ocr_det_dataset_examples/train.txt] \ + Train.dataset.label_file_list='[./ocr_det_dataset_examples/train.txt]' \ Eval.dataset.data_dir=./ocr_det_dataset_examples \ - Eval.dataset.label_file_list=[./ocr_det_dataset_examples/val.txt] + Eval.dataset.label_file_list='[./ocr_det_dataset_examples/val.txt]' # Multi-GPU training (specify GPUs with --gpus) python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml \ -o Global.pretrained_model=./PP-OCRv5_server_det_pretrained.pdparams \ Train.dataset.data_dir=./ocr_det_dataset_examples \ - Train.dataset.label_file_list=[./ocr_det_dataset_examples/train.txt] \ + Train.dataset.label_file_list='[./ocr_det_dataset_examples/train.txt]' \ Eval.dataset.data_dir=./ocr_det_dataset_examples \ - Eval.dataset.label_file_list=[./ocr_det_dataset_examples/val.txt] + Eval.dataset.label_file_list='[./ocr_det_dataset_examples/val.txt]' ``` ### 4.3 Model Evaluation @@ -428,7 +459,7 @@ You can evaluate trained weights (e.g., `output/PP-OCRv5_server_det/best_accurac python3 tools/eval.py -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml \ -o Global.pretrained_model=output/PP-OCRv5_server_det/best_accuracy.pdparams \ Eval.dataset.data_dir=./ocr_det_dataset_examples \ - Eval.dataset.label_file_list=[./ocr_det_dataset_examples/val.txt] + Eval.dataset.label_file_list='[./ocr_det_dataset_examples/val.txt]' ``` ### 4.4 Model Export diff --git a/docs/version3.x/module_usage/text_detection.md b/docs/version3.x/module_usage/text_detection.md index a672b462a9fd8acc245c38aa6d90cccc6ea80171..713a1ffdaa049b511550c6692e981f15401230c3 100644 --- a/docs/version3.x/module_usage/text_detection.md +++ b/docs/version3.x/module_usage/text_detection.md @@ -114,7 +114,7 @@ paddleocr text_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/i ```python from paddleocr import TextDetection -model = TextDetection(model_name="PP-OCRv5_mobile_det") +model = TextDetection(model_name="PP-OCRv5_server_det") output = model.predict("general_ocr_001.png", batch_size=1) for res in output: res.print() @@ -131,9 +131,9 @@ for res in output: ..., - [[ 37, 408], + [[ 31, 406], ..., - [ 39, 453]]], dtype=int16), 'dt_scores': [0.832930755107492, 0.8186143846140158, 0.8591595100376676, 0.8718863959111733]}} + [ 34, 455]]], dtype=int16), 'dt_scores': [0.873949039891189, 0.8948166013613552, 0.8842595305917041, 0.876953790920377]}} ``` 运行结果参数含义如下: @@ -148,111 +148,119 @@ for res in output: 相关方法、参数等说明如下: -* `TextDetection`实例化文本检测模型(此处以`PP-OCRv5_mobile_det`为例),具体说明如下: +* `TextDetection`实例化文本检测模型(此处以`PP-OCRv5_server_det`为例),具体说明如下:
                - + - + - - + - - + - + - - + - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
                + + + + + + + + + + + + + + + - - - - - - + + + - - - - - + + + - - - - - + + + - - - - - - - - - - - - + + + - - - - + + + +
                参数 参数说明 参数类型可选项 默认值
                model_name模型名称模型名称。所有支持的文本检测模型名称,如 PP-OCRv5_mobile_det str所有支持的文本检测模型名称None
                model_dir 模型存储路径 strNone
                device模型推理设备用于推理的设备。
                +例如:cpugpunpugpu:0gpu:0,1
+如指定多个设备,将进行并行推理。请注意,并非所有情况下都支持并行推理。<br/>
                +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 +
                str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0None
                limit_side_len检测的图像边长限制int/Noneenable_hpi是否启用高性能推理。boolFalse
                use_tensorrt是否启用 Paddle Inference 的 TensorRT 子图引擎。boolFalse
                min_subgraph_size当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。int3
                precision当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
                可选项:fp32fp16 等。
                strfp32
                enable_mkldnn -
                  -
                • int: 大于0的任意整数
                NoneboolTrue
                cpu_threads在 CPU 上推理时使用的线程数量。int10
                limit_side_len检测的图像边长限制:int 表示边长限制数值,如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。int / NoneNone
                limit_type检测的图像边长限制,检测的边长限制类型 str/None -
                  -
                • str: 支持min和max. min表示保证图像最短边不小于det_limit_side_len, max: 表示保证图像最长边不大于limit_side_len。
None检测的边长限制类型,"min" 表示保证图像最短边不小于limit_side_len,"max"表示保证图像最长边不大于limit_side_len。如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。str / NoneNone
                thresh输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 float/None -
                  -
                • float: 大于0的任意浮点数 -
None像素得分阈值。输出概率图中得分大于该阈值的像素点被认为是文本像素。可选大于0的任意浮点数,如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。float / NoneNone
                box_thresh检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 float/None -
                  -
                • float: 大于0的任意浮点数 -
None检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。可选大于0的任意浮点数,如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。float / NoneNone
                unclip_ratioVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张 float/None -
                  -
                • float: 大于0的任意浮点数 -
                None
                use_hpip是否启用高性能推理插件boolFalseVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张。可选大于0的任意浮点数。如果设置为None, 将默认使用PaddleOCR官方模型配置中的该参数值。float / NoneNone
                hpi_config高性能推理配置dict | Noneinput_shape模型输入图像尺寸,格式为 (C, H, W)。若为 None 将默认使用PaddleOCR官方模型配置中的该参数值。tuple / None None
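在介绍 `predict()` 方法之前,下面给出一个最小示例,演示如何组合使用上表中的部分构造参数;其中的阈值仅为示意取值,并非推荐默认值:

```python
from paddleocr import TextDetection

# 最小示例:显式设置部分后处理参数来实例化文本检测模型。
# 以下数值仅作示意,并非调优后的推荐值。
model = TextDetection(
    model_name="PP-OCRv5_server_det",
    device="cpu",       # 如有 GPU,可改为 "gpu:0"
    thresh=0.3,         # 像素得分阈值(示意值)
    box_thresh=0.6,     # 文字区域平均得分阈值(示意值)
    unclip_ratio=1.5,   # Vatti clipping 扩张系数(示意值)
)
output = model.predict("general_ocr_001.png", batch_size=1)
for res in output:
    res.print()
    res.save_to_json("./output/res.json")
```

未显式设置的参数,将按上表描述的默认行为处理。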
* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。

-* 调用文本检测模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input`、 `batch_size`、 `limit_side_len`、 `limit_type`、 `thresh`、 `box_thresh`、 `max_candidates`、`unclip_ratio`和`use_dilation`,具体说明如下:
+* 调用文本检测模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input`、 `batch_size`、 `limit_side_len`、 `limit_type`、 `thresh`、 `box_thresh`、`unclip_ratio`,具体说明如下:
                参数 参数说明 参数类型可选项 默认值
                input待预测数据,支持多种输入类型Python Var/str/dict/list +待预测数据,支持多种输入类型,必填。
                  -
                • Python变量,如numpy.ndarray表示的图像数据
                • -
                • 文件路径,如图像文件的本地路径:/root/data/img.jpg
                • -
                • URL链接,如图像文件的网络URL:示例
                • -
                • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
                • -
                • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                • +
                • Python Var:如 numpy.ndarray 表示的图像数据
                • +
• str:图像文件或PDF文件的本地路径,如 /root/data/img.jpg;图像文件或PDF文件的网络URL,如 示例;本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
                • +
                • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                Python Var|str|list
                batch_size批大小批大小,可设置为任意正整数。 int大于0的任意整数 1
                limit_side_len检测的图像边长限制int/None -
                  -
                • int: 大于0的任意整数 -
                • None: 如果设置为None, 将默认使用模型初始化的该参数值
None检测的图像边长限制:int 表示边长限制数值,如果设置为None, 将默认使用模型初始化的该参数值。int / NoneNone
                limit_type检测的图像边长限制,检测的边长限制类型 str/None -
                  -
                • str: 支持min和max. min表示保证图像最短边不小于det_limit_side_len, max: 表示保证图像最长边不大于limit_side_len -
                • None: 如果设置为None, 将默认使用模型初始化的该参数值
None检测的边长限制类型,"min" 表示保证图像最短边不小于limit_side_len,"max"表示保证图像最长边不大于limit_side_len。如果设置为None, 将默认使用模型初始化的该参数值。str / NoneNone
                thresh输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 float/None -
                  -
                • float: 大于0的任意浮点数 -
                • None: 如果设置为None, 将默认使用模型初始化的该参数值
None像素得分阈值。输出概率图中得分大于该阈值的像素点被认为是文本像素。可选大于0的任意浮点数,如果设置为None, 将默认使用模型初始化的该参数值。float / NoneNone
                box_thresh检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 float/None -
                  -
                • float: 大于0的任意浮点数 -
                • None: 如果设置为None, 将默认使用模型初始化的该参数值
None检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。可选大于0的任意浮点数,如果设置为None, 将默认使用模型初始化的该参数值。float / NoneNone
                unclip_ratioVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张 float/None -
                  -
                • float: 大于0的任意浮点数 -
                • None: 如果设置为None, 将默认使用模型初始化的该参数值
                NoneVatti clipping算法的扩张系数,使用该方法对文字区域进行扩张。可选大于0的任意浮点数。如果设置为None, 将默认使用模型初始化的该参数值。float / NoneNone
                * 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为图片、保存为`json`文件的操作: @@ -460,18 +438,18 @@ PaddleOCR 对代码进行了模块化,训练 `PP-OCRv5_server_det` 识别模 python3 tools/train.py -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml \ -o Global.pretrained_model=./PP-OCRv5_server_det_pretrained.pdparams \ Train.dataset.data_dir=./ocr_det_dataset_examples \ - Train.dataset.label_file_list=[./ocr_det_dataset_examples/train.txt] \ + Train.dataset.label_file_list='[./ocr_det_dataset_examples/train.txt]' \ Eval.dataset.data_dir=./ocr_det_dataset_examples \ - Eval.dataset.label_file_list=[./ocr_det_dataset_examples/val.txt] + Eval.dataset.label_file_list='[./ocr_det_dataset_examples/val.txt]' #多卡训练,通过--gpus参数指定卡号 python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml \ -o Global.pretrained_model=./PP-OCRv5_server_det_pretrained.pdparams \ Train.dataset.data_dir=./ocr_det_dataset_examples \ - Train.dataset.label_file_list=[./ocr_det_dataset_examples/train.txt] \ + Train.dataset.label_file_list='[./ocr_det_dataset_examples/train.txt]' \ Eval.dataset.data_dir=./ocr_det_dataset_examples \ - Eval.dataset.label_file_list=[./ocr_det_dataset_examples/val.txt] + Eval.dataset.label_file_list='[./ocr_det_dataset_examples/val.txt]' ``` ### 4.3 模型评估 @@ -484,7 +462,7 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ python3 tools/eval.py -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml \ -o Global.pretrained_model=output/PP-OCRv5_server_det/best_accuracy.pdparams \ Eval.dataset.data_dir=./ocr_det_dataset_examples \ - Eval.dataset.label_file_list=[./ocr_det_dataset_examples/val.txt] + Eval.dataset.label_file_list='[./ocr_det_dataset_examples/val.txt]' ``` ### 4.4 模型导出 diff --git a/docs/version3.x/module_usage/text_image_unwarping.en.md b/docs/version3.x/module_usage/text_image_unwarping.en.md index 92aad3825cfb0e6a468f3f10110030e4da4a8c56..546bc659ef532571573f5bf8149c4c135b33a9d9 100644 --- a/docs/version3.x/module_usage/text_image_unwarping.en.md +++ b/docs/version3.x/module_usage/text_image_unwarping.en.md @@ -119,45 +119,71 @@ The relevant methods, parameters, etc., are described as follows: Parameter Description Type -Options -Default Value +Default + model_name -Model Name +Name of the model str -All model names supported by PaddleX -None +None model_dir -Model Storage Path +Model storage path str -None -None +None device -Model Inference Device +Device(s) to use for inference.
                +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
                +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
                +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -Supports specifying specific GPU card numbers, such as “gpu:0”, specific hardware card numbers, such as “npu:0”, CPU as “cpu”. -gpu:0 +None -use_hpip -Whether to enable high-performance inference plugin +enable_hpi +Whether to use the high performance inference. bool -None False -hpi_config -High-Performance Inference Configuration -dict | None -None -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False + + +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
                Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn + +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + * Among them, `model_name` must be specified. After specifying `model_name`, the default model parameters built into PaddleX are used. When `model_dir` is specified, the user-defined model is used. @@ -170,30 +196,28 @@ The relevant methods, parameters, etc., are described as follows: Parameter Description Type -Options -Default Value +Default input -Data to be predicted, supports multiple input types -Python Var/str/dict/list - +Input data to be predicted. Required. Supports multiple input types:
                  -
                • Python Variable, such as numpy.ndarray representing image data
                • -
                • File Path, such as the local path of an image file: /root/data/img.jpg
                • -
                • URL Link, such as the network URL of an image file: Example
                • -
                • Local Directory, which should contain data files to be predicted, such as the local path: /root/data/
                • -
                • List, where list elements must be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
                • +
                • Python Var: e.g., numpy.ndarray representing image data
                • +
                • str: + - Local image or PDF file path: /root/data/img.jpg; + - URL of image or PDF file: e.g., example; + - Local directory: directory containing images for prediction, e.g., /root/data/ (Note: directories containing PDF files are not supported; PDFs must be specified by exact file path)
                • +
                • List: Elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
                -None +Python Var|str|list + batch_size -Batch Size +Batch size, positive integer. int -Any integer 1 diff --git a/docs/version3.x/module_usage/text_image_unwarping.md b/docs/version3.x/module_usage/text_image_unwarping.md index f86e849a37b2b5c05e59b2da87f180d089e542f3..7485a36c2f9f664a41cbada37a26bc6c05495622 100644 --- a/docs/version3.x/module_usage/text_image_unwarping.md +++ b/docs/version3.x/module_usage/text_image_unwarping.md @@ -119,45 +119,71 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 + model_name 模型名称 str -所有支持的模型名称 -无 +None model_dir 模型存储路径 str -无 -无 +None device -模型推理设备 +用于推理的设备。
                +例如:cpugpunpugpu:0gpu:0,1
+如指定多个设备,将进行并行推理。请注意,并非所有情况下都支持并行推理。<br/>
                +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 + str -支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 -gpu:0 +None -use_hpip -是否启用高性能推理插件 +enable_hpi +是否启用高性能推理。 bool -无 False -hpi_config -高性能推理配置 -dict | None -无 -None +use_tensorrt +是否启用 Paddle Inference 的 TensorRT 子图引擎。 +bool +False + + +min_subgraph_size +当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。 +int +3 + +precision +当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
                可选项:fp32fp16 等。 +str +fp32 + + +enable_mkldnn + +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
                + +bool +True + + +cpu_threads +在 CPU 上推理时使用的线程数量。 +int +10 + + * 其中,`model_name` 必须指定,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 @@ -170,30 +196,25 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 input -待预测数据,支持多种输入类型 -Python Var/str/dict/list - +待预测数据,支持多种输入类型,必填。
                  -
                • Python变量,如numpy.ndarray表示的图像数据
                • -
                • 文件路径,如图像文件的本地路径:/root/data/img.jpg
                • -
                • URL链接,如图像文件的网络URL:示例
                • -
                • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
                • -
                • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                • +
                • Python Var:如 numpy.ndarray 表示的图像数据
                • +
• str:图像文件或PDF文件的本地路径,如 /root/data/img.jpg;图像文件或PDF文件的网络URL,如 示例;本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
                • +
                • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                -无 +Python Var|str|list + batch_size -批大小 +批大小,可设置为任意正整数。 int -任意整数 1 diff --git a/docs/version3.x/module_usage/text_line_orientation_classification.en.md b/docs/version3.x/module_usage/text_line_orientation_classification.en.md deleted file mode 100644 index 45dbef9cdca7927aecff1a322c3644a63b29d283..0000000000000000000000000000000000000000 --- a/docs/version3.x/module_usage/text_line_orientation_classification.en.md +++ /dev/null @@ -1,220 +0,0 @@ ---- -comments: true ---- - -# Text Line Orientation Classification Module Tutorial - -## 1. Overview -The text line orientation classification module identifies the orientation of text lines and corrects them through post-processing. During processes like document scanning or ID photo capture, users may rotate the shooting device for better clarity, resulting in text lines with varying orientations. Standard OCR workflows often struggle with such data. By employing image classification technology, this module pre-determines text line orientation and adjusts it, thereby enhancing OCR accuracy. - -## 2. Supported Models - - - - - - - - - - - - - - - - - - - - - - - -
                ModelDownload LinksTop-1 Acc (%)GPU Inference Time (ms)CPU Inference Time (ms)Model Size (M)Description
                PP-LCNet_x0_25_textline_oriInference Model/Training Model95.54--0.32A text line classification model based on PP-LCNet_x0_25, with two classes: 0° and 180°.
                - -Testing Environment: - -
                  -
                • Performance Testing Environment -
                    -
                  • Test Dataset: PaddleX's proprietary dataset, covering scenarios like IDs and documents, with 1,000 images.
                  • -
                  • Hardware: -
                      -
                    • GPU: NVIDIA Tesla T4
                    • -
                    • CPU: Intel Xeon Gold 6271C @ 2.60GHz
                    • -
                    • Other: Ubuntu 20.04 / cuDNN 8.6 / TensorRT 8.5.2.2
                    • -
                    -
                  • -
                  -
                • -
                • Inference Mode Description
                • -
                - - - - - - - - - - - - - - - - - - - - - - - - - -
                ModeGPU ConfigurationCPU ConfigurationAcceleration Techniques
                Standard ModeFP32 precision / No TRT accelerationFP32 precision / 8 threadsPaddleInference
                High-Performance ModeOptimal combination of precision and acceleration strategiesFP32 precision / 8 threadsOptimal backend selection (Paddle/OpenVINO/TRT, etc.)
                - -## 3. Quick Start - -> ❗ Before starting, ensure you have installed the PaddleOCR wheel package. Refer to the [Installation Guide](../installation.en.md) for details. - -Run the following command for a quick demo: - -```bash -paddleocr text_line_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg -``` - -Alternatively, integrate the module into your project. Download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg) locally before running the code below. - -```python -from paddleocr import TextLineOrientationClassification -model = TextLineOrientationClassification(model_name="PP-LCNet_x0_25_textline_ori") -output = model.predict("textline_rot180_demo.jpg", batch_size=1) -for res in output: - res.print(json_format=False) - res.save_to_img("./output/demo.png") - res.save_to_json("./output/res.json") -``` - -The output will be: - -```bash -{'res': {'input_path': 'textline_rot180_demo.jpg', 'page_index': None, 'class_ids': array([1], dtype=int32), 'scores': array([1.], dtype=float32), 'label_names': ['180_degree']}} -``` - -Key output fields: -- `input_path`: Path of the input image. -- `page_index`: For PDF inputs, indicates the page number; otherwise, `None`. -- `class_ids`: Predicted class IDs (0° or 180°). -- `scores`: Confidence scores. -- `label_names`: Predicted class labels. - -Visualization: - - - -### Method and Parameter Details - -* **`TextLineOrientationClassification` Initialization** (using `PP-LCNet_x0_25_textline_ori` as an example): - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                ParameterDescriptionTypeOptionsDefault
                model_nameModel namestrN/ANone
                model_dirCustom model pathstrN/ANone
                deviceInference devicestrE.g., "gpu:0", "npu:0", "cpu"gpu:0
                use_hpipEnable high-performance inferenceboolN/AFalse
                hpi_configHPI configurationdict | NoneN/ANone
                - -* **`predict()` Method**: - - `input`: Supports various input types (numpy array, file path, URL, directory, or list). - - `batch_size`: Batch size (default: 1). - -* **Result Handling**: - Each prediction result is a `Result` object with methods like `print()`, `save_to_img()`, and `save_to_json()`. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                MethodDescriptionParametersTypeDetailsDefault
                print()Print resultsformat_json, indent, ensure_asciibool, int, boolControl JSON formatting and ASCII escapingTrue, 4, False
                save_to_json()Save results as JSONsave_path, indent, ensure_asciistr, int, boolSame as print()N/A, 4, False
                save_to_img()Save visualized resultssave_pathstrOutput pathN/A
                - -* **Attributes**: - - `json`: Get results in JSON format. - - `img`: Get visualized images as a dictionary. - -## 4. Custom Development - -Since PaddleOCR does not natively support training for text line orientation classification, refer to [PaddleX's Custom Development Guide](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/textline_orientation_classification.html#iv-custom-development) for training. Trained models can seamlessly integrate into PaddleOCR's API for inference. diff --git a/docs/version3.x/module_usage/text_recognition.en.md b/docs/version3.x/module_usage/text_recognition.en.md index 407351c9273127299f708ecdd76977b5e7de117f..d72907e0264251addd369db667dc6a9644187f11 100644 --- a/docs/version3.x/module_usage/text_recognition.en.md +++ b/docs/version3.x/module_usage/text_recognition.en.md @@ -446,45 +446,77 @@ The descriptions of relevant methods and parameters are as follows: Parameter Description Type -Options -Default Value +Default + model_name -Model name +Name of the model str -All model names supported by PaddleX -None +None model_dir Model storage path str -None -None +None device -Model inference device +Device(s) to use for inference.
                +Examples: cpu, gpu, npu, gpu:0, gpu:0,1.
                +If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.
                +By default, GPU 0 will be used if available; otherwise, the CPU will be used. + str -Supports specifying specific GPU card numbers, such as "gpu:0", specific card numbers for other hardware, such as "npu:0", and "cpu" for CPU. -gpu:0 +None -use_hpip -Whether to enable the high-performance inference plugin +enable_hpi +Whether to use the high performance inference. bool -None False -hpi_config -High-performance inference configuration -dict | None -None +use_tensorrt +Whether to use the Paddle Inference TensorRT subgraph engine. +bool +False + + +min_subgraph_size +Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine. +int +3 + + +precision +Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
                Options: fp32, fp16, etc. +str +fp32 + + +enable_mkldnn + +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. + +bool +True + + +cpu_threads +Number of threads to use for inference on CPUs. +int +10 + + +input_shape +Input image size for the model in the format (C, H, W). If set to None, the model's default size will be used. +tuple / None None + * Among them, `model_name` must be specified. After specifying `model_name`, the default model parameters built into PaddleX are used. On this basis, when `model_dir` is specified, the user-defined model is used. @@ -518,7 +550,7 @@ The descriptions of relevant methods and parameters are as follows: batch_size -Batch size +Batch size, positive integer. int Any integer 1 diff --git a/docs/version3.x/module_usage/text_recognition.md b/docs/version3.x/module_usage/text_recognition.md index 19a1a8b1bea4c1a8ba01ac64003dad69da8dc649..44c4740f9eebad55c8a7ed7e2fef69a777905880 100644 --- a/docs/version3.x/module_usage/text_recognition.md +++ b/docs/version3.x/module_usage/text_recognition.md @@ -39,26 +39,26 @@ PP-OCRv5_mobile_rec_infer.tar">推理模型/推理模型/训练模型 -86.58 +81.53 6.65 / 2.38 32.92 / 32.92 -181 M +74.7 M PP-OCRv4_server_rec_doc是在PP-OCRv4_server_rec的基础上,在更多中文文档数据和PP-OCR训练数据的混合数据训练而成,增加了部分繁体字、日文、特殊字符的识别能力,可支持识别的字符为1.5万+,除文档相关的文字识别能力提升外,也同时提升了通用文字的识别能力 PP-OCRv4_mobile_rec推理模型/训练模型 -83.28 +78.74 4.82 / 1.20 16.74 / 4.64 -88 M +10.6 M PP-OCRv4的轻量级识别模型,推理效率高,可以部署在包含端侧设备的多种硬件设备中 PP-OCRv4_server_rec 推理模型/训练模型 -85.19 +80.61 6.58 / 2.43 33.17 / 33.17 -151 M +71.2 M PP-OCRv4的服务器端模型,推理精度高,可以部署在多种不同的服务器上 @@ -67,12 +67,12 @@ en_PP-OCRv4_mobile_rec_infer.tar">推理模型/推理模型/推理模型/训练模型 -86.58 +81.53 6.65 / 2.38 32.92 / 32.92 -181 M +74.7 M PP-OCRv4_server_rec_doc是在PP-OCRv4_server_rec的基础上,在更多中文文档数据和PP-OCR训练数据的混合数据训练而成,增加了部分繁体字、日文、特殊字符的识别能力,可支持识别的字符为1.5万+,除文档相关的文字识别能力提升外,也同时提升了通用文字的识别能力 PP-OCRv4_mobile_rec推理模型/训练模型 -83.28 +78.74 4.82 / 1.20 16.74 / 4.64 -88 M +10.6 M PP-OCRv4的轻量级识别模型,推理效率高,可以部署在包含端侧设备的多种硬件设备中 PP-OCRv4_server_rec 推理模型/训练模型 -85.19 +80.61 6.58 / 2.43 33.17 / 33.17 -151 M +71.2 M PP-OCRv4的服务器端模型,推理精度高,可以部署在多种不同的服务器上 PP-OCRv3_mobile_rec推理模型/训练模型 -75.43 +72.96 5.87 / 1.19 9.07 / 4.28 -138 M +9.2 M PP-OCRv3的轻量级识别模型,推理效率高,可以部署在包含端侧设备的多种硬件设备中 @@ -175,7 +175,7 @@ PP-OCRv3_mobile_rec_infer.tar">推理模型/ SVTRv2 是一种由复旦大学视觉与学习实验室(FVL)的OpenOCR团队研发的服务端文本识别模型,其在PaddleOCR算法模型挑战赛 - 赛题一:OCR端到端识别任务中荣获一等奖,A榜端到端识别精度相比PP-OCRv4提升6%。 @@ -196,7 +196,7 @@ SVTRv2 是一种由复旦大学视觉与学习实验室(FVL)的OpenOCR团队 65.07 5.93 / 1.62 20.73 / 7.32 -70 M +22.1 M RepSVTR 文本识别模型是一种基于SVTRv2 的移动端文本识别模型,其在PaddleOCR算法模型挑战赛 - 赛题一:OCR端到端识别任务中荣获一等奖,B榜端到端识别精度相比PP-OCRv4提升2.5%,推理速度持平。 @@ -217,7 +217,7 @@ en_PP-OCRv4_mobile_rec_infer.tar">推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/推理模型/示例 -
-<li><b>本地目录</b>,该目录下需包含待预测数据文件,如本地路径:<code>/root/data/</code></li>
-<li><b>列表</b>,列表元素需为上述类型数据,如<code>[numpy.ndarray, numpy.ndarray]</code>,<code>["/root/data/img1.jpg", "/root/data/img2.jpg"]</code>,<code>["/root/data1", "/root/data2"]</code></li>
+<li><b>Python Var</b>:如 <code>numpy.ndarray</code> 表示的图像数据</li>
+<li><b>str</b>:如图像文件或者PDF文件的本地路径:<code>/root/data/img.jpg</code>;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:<code>/root/data/</code>(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)</li>
+<li><b>List</b>:列表元素需为上述类型数据,如<code>[numpy.ndarray, numpy.ndarray]</code>,<code>["/root/data/img1.jpg", "/root/data/img2.jpg"]</code>,<code>["/root/data1", "/root/data2"]</code></li>
              -无 +Python Var|str|list + batch_size -批大小 +批大小,可设置为任意正整数。 int -任意整数 1 diff --git a/docs/version3.x/module_usage/textline_orientation_classification.en.md b/docs/version3.x/module_usage/textline_orientation_classification.en.md new file mode 100644 index 0000000000000000000000000000000000000000..18a4c1b309df5d8e4c432bcc92e333863a85a1fa --- /dev/null +++ b/docs/version3.x/module_usage/textline_orientation_classification.en.md @@ -0,0 +1,364 @@ +--- +comments: true +--- + +# Text Line Orientation Classification Module Tutorial + +## 1. Overview +The text line orientation classification module primarily distinguishes the orientation of text lines and corrects them using post-processing. In processes such as document scanning and license/certificate photography, to capture clearer images, the capture device may be rotated, resulting in text lines in various orientations. Standard OCR pipelines cannot handle such data well. By utilizing image classification technology, the orientation of text lines can be predetermined and adjusted, thereby enhancing the accuracy of OCR processing. + +## 2. Supported Model List + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table>
+<thead>
+<tr>
+<th>Model</th>
+<th>Model Download Link</th>
+<th>Top-1 Accuracy (%)</th>
+<th>GPU Inference Time (ms)<br/>[Normal Mode / High-Performance Mode]</th>
+<th>CPU Inference Time (ms)</th>
+<th>Model Size (M)</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PP-LCNet_x0_25_textline_ori</td>
+<td>Inference Model/Training Model</td>
+<td>98.85</td>
+<td>-</td>
+<td>-</td>
+<td>0.96</td>
+<td>Text line classification model based on PP-LCNet_x0_25, with two classes: 0 degrees and 180 degrees</td>
+</tr>
+<tr>
+<td>PP-LCNet_x1_0_textline_ori</td>
+<td>Inference Model/Training Model</td>
+<td>99.42</td>
+<td>-</td>
+<td>-</td>
+<td>6.5</td>
+<td>Text line classification model based on PP-LCNet_x1_0, with two classes: 0 degrees and 180 degrees</td>
+</tr>
+</tbody>
+</table>
+
+> ❗ **Note**: The text line orientation classification model was upgraded on May 26, 2025, and `PP-LCNet_x1_0_textline_ori` has been added. If you need to use the pre-upgrade model weights, please click the download link.
+
+Test Environment Description:
+
+<ul>
+<li><b>Performance Test Environment</b>
+<ul>
+<li><b>Test Dataset</b>: PaddleX self-built dataset, covering multiple scenarios such as documents and certificates, containing 1000 images.</li>
+<li><b>Hardware Configuration</b>:
+<ul>
+<li>GPU: NVIDIA Tesla T4</li>
+<li>CPU: Intel Xeon Gold 6271C @ 2.60GHz</li>
+<li>Other Environments: Ubuntu 20.04 / cuDNN 8.6 / TensorRT 8.5.2.2</li>
+</ul>
+</li>
+</ul>
+</li>
+<li><b>Inference Mode Description</b></li>
+</ul>
+
+<table border="1">
+<thead>
+<tr>
+<th>Mode</th>
+<th>GPU Configuration</th>
+<th>CPU Configuration</th>
+<th>Acceleration Technology Combination</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Normal Mode</td>
+<td>FP32 precision / no TRT acceleration</td>
+<td>FP32 precision / 8 threads</td>
+<td>PaddleInference</td>
+</tr>
+<tr>
+<td>High-Performance Mode</td>
+<td>Optimal combination of pre-selected precision types and acceleration strategies</td>
+<td>FP32 precision / 8 threads</td>
+<td>Pre-selected optimal backend (Paddle/OpenVINO/TRT, etc.)</td>
+</tr>
+</tbody>
+</table>
+
+## 3. Quick Integration
+
+> ❗ Before starting, please install the wheel package of PaddleOCR. For detailed instructions, refer to the [Installation Guide](../installation.en.md).
+
+You can quickly experience the functionality with a single command:
+
+```bash
+paddleocr textline_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg
+```
+
+You can also integrate the text line orientation classification model into your project. Run the following code after downloading the [example image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg) to your local machine.
+
+```python
+from paddleocr import TextLineOrientationClassification
+model = TextLineOrientationClassification(model_name="PP-LCNet_x0_25_textline_ori")
+output = model.predict("textline_rot180_demo.jpg", batch_size=1)
+for res in output:
+    res.print(json_format=False)
+    res.save_to_img("./output/demo.png")
+    res.save_to_json("./output/res.json")
+```
+
+After running, the result obtained is:
+
+```bash
+{'res': {'input_path': 'textline_rot180_demo.jpg', 'page_index': None, 'class_ids': array([1], dtype=int32), 'scores': array([0.99864], dtype=float32), 'label_names': ['180_degree']}}
+```
+
+The meanings of the result parameters are as follows:
+
+- `input_path`: Indicates the path of the input image.
+- `page_index`: If the input is a PDF file, it indicates the current page number of the PDF; otherwise, it is `None`.
+- `class_ids`: Indicates the class ID of the prediction result.
+- `scores`: Indicates the confidence score of the prediction result.
+- `label_names`: Indicates the class name of the prediction result.
+
+The visualization image is as follows:
+
+The explanations for the methods, parameters, etc., are as follows:
+
+* `TextLineOrientationClassification` instantiates a text line orientation classification model (here, `PP-LCNet_x0_25_textline_ori` is used as an example), with the specific explanations as follows:
+
+<table>
+<thead>
+<tr>
+<th>Parameter</th>
+<th>Description</th>
+<th>Type</th>
+<th>Default</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>model_name</code></td>
+<td>Name of the model</td>
+<td><code>str</code></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><code>model_dir</code></td>
+<td>Model storage path</td>
+<td><code>str</code></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><code>device</code></td>
+<td>Device(s) to use for inference.<br/>
+Examples: <code>cpu</code>, <code>gpu</code>, <code>npu</code>, <code>gpu:0</code>, <code>gpu:0,1</code>.<br/>
+If multiple devices are specified, inference will be performed in parallel. Note that parallel inference is not always supported.<br/>
+By default, GPU 0 will be used if available; otherwise, the CPU will be used.
+</td>
+<td><code>str</code></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><code>enable_hpi</code></td>
+<td>Whether to use high-performance inference.</td>
+<td><code>bool</code></td>
+<td><code>False</code></td>
+</tr>
+<tr>
+<td><code>use_tensorrt</code></td>
+<td>Whether to use the Paddle Inference TensorRT subgraph engine.</td>
+<td><code>bool</code></td>
+<td><code>False</code></td>
+</tr>
+<tr>
+<td><code>min_subgraph_size</code></td>
+<td>Minimum subgraph size for TensorRT when using the Paddle Inference TensorRT subgraph engine.</td>
+<td><code>int</code></td>
+<td><code>3</code></td>
+</tr>
+<tr>
+<td><code>precision</code></td>
+<td>Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.<br/>Options: <code>fp32</code>, <code>fp16</code>, etc.</td>
+<td><code>str</code></td>
+<td><code>fp32</code></td>
+</tr>
+<tr>
+<td><code>enable_mkldnn</code></td>
+<td>Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set.</td>
+<td><code>bool</code></td>
+<td><code>True</code></td>
+</tr>
+<tr>
+<td><code>cpu_threads</code></td>
+<td>Number of threads to use for inference on CPUs.</td>
+<td><code>int</code></td>
+<td><code>10</code></td>
+</tr>
+<tr>
+<td><code>top_k</code></td>
+<td>The top-k value for prediction results. If not specified, the default value in the official PaddleOCR model configuration is used. If the value is 5, the top 5 categories and their corresponding classification probabilities will be returned.</td>
+<td><code>int</code></td>
+<td><code>None</code></td>
+</tr>
+</tbody>
+</table>
+
+* `model_name` must be specified. Once `model_name` is set, the default built-in model parameters of PaddleOCR will be used. On this basis, if `model_dir` is specified, the user-defined model will be used.
+
+* Use the `predict()` method of the text line orientation classification model to perform inference. This method returns a list of results. In addition, this module also provides the `predict_iter()` method. Both methods accept the same parameters and return the same result format; the difference is that `predict_iter()` returns a `generator`, which processes and retrieves prediction results step by step, making it suitable for large datasets or memory-efficient scenarios. Choose either method based on your actual needs. The `predict()` method accepts the parameters `input` and `batch_size`, described in detail below:
+
+<table>
+<thead>
+<tr>
+<th>Parameter</th>
+<th>Description</th>
+<th>Type</th>
+<th>Default</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>input</code></td>
+<td>Input data for prediction. Multiple input types are supported. This parameter is required.
+<ul>
+<li><b>Python Var</b>: such as <code>numpy.ndarray</code> representing image data</li>
+<li><b>str</b>: such as the local path of an image or PDF file: <code>/root/data/img.jpg</code>; a URL link, such as the online URL of an image or PDF file: Example; or a local directory that contains images for prediction, such as <code>/root/data/</code> (currently, directories containing PDF files are not supported; PDF files must be specified as individual file paths)</li>
+<li><b>List</b>: list elements must be of the above types, such as <code>[numpy.ndarray, numpy.ndarray]</code>, <code>["/root/data/img1.jpg", "/root/data/img2.jpg"]</code>, <code>["/root/data1", "/root/data2"]</code></li>
+</ul>
+</td>
+<td><code>Python Var|str|list</code></td>
+<td></td>
+</tr>
+<tr>
+<td><code>batch_size</code></td>
+<td>Batch size, positive integer.</td>
+<td><code>int</code></td>
+<td><code>1</code></td>
+</tr>
+</tbody>
+</table>
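+
+As noted above, `predict_iter()` is the memory-friendly variant of `predict()`. A minimal sketch under the same setup (the image file names below are placeholders):
+
+```python
+from paddleocr import TextLineOrientationClassification
+
+model = TextLineOrientationClassification(model_name="PP-LCNet_x0_25_textline_ori")
+# predict_iter() yields results one at a time instead of materializing
+# the full result list, so only the current batch stays in memory.
+for res in model.predict_iter(["img_0001.jpg", "img_0002.jpg"], batch_size=1):
+    res.save_to_json("./output/")
+```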
              + +* The prediction results are processed, and the prediction result for each sample is of type `dict`. It supports operations such as printing, saving as an image, and saving as a `json` file: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table>
+<thead>
+<tr>
+<th>Method</th>
+<th>Method Description</th>
+<th>Parameter</th>
+<th>Parameter Type</th>
+<th>Parameter Description</th>
+<th>Default Value</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td rowspan="3"><code>print()</code></td>
+<td rowspan="3">Print the results to the terminal</td>
+<td><code>format_json</code></td>
+<td><code>bool</code></td>
+<td>Whether to format the output content using JSON indentation</td>
+<td><code>True</code></td>
+</tr>
+<tr>
+<td><code>indent</code></td>
+<td><code>int</code></td>
+<td>Specify the indentation level to beautify the output JSON data, making it more readable; only effective when <code>format_json</code> is <code>True</code></td>
+<td>4</td>
+</tr>
+<tr>
+<td><code>ensure_ascii</code></td>
+<td><code>bool</code></td>
+<td>Control whether to escape non-ASCII characters to Unicode. If set to <code>True</code>, all non-ASCII characters will be escaped; <code>False</code> retains the original characters; only effective when <code>format_json</code> is <code>True</code></td>
+<td><code>False</code></td>
+</tr>
+<tr>
+<td rowspan="3"><code>save_to_json()</code></td>
+<td rowspan="3">Save the results as a JSON file</td>
+<td><code>save_path</code></td>
+<td><code>str</code></td>
+<td>The path to save the file. If it is a directory, the saved file name will be consistent with the input file name</td>
+<td>None</td>
+</tr>
+<tr>
+<td><code>indent</code></td>
+<td><code>int</code></td>
+<td>Specify the indentation level to beautify the output JSON data, making it more readable; only effective when <code>format_json</code> is <code>True</code></td>
+<td>4</td>
+</tr>
+<tr>
+<td><code>ensure_ascii</code></td>
+<td><code>bool</code></td>
+<td>Control whether to escape non-ASCII characters to Unicode. If set to <code>True</code>, all non-ASCII characters will be escaped; <code>False</code> retains the original characters; only effective when <code>format_json</code> is <code>True</code></td>
+<td><code>False</code></td>
+</tr>
+<tr>
+<td><code>save_to_img()</code></td>
+<td>Save the results as an image file</td>
+<td><code>save_path</code></td>
+<td><code>str</code></td>
+<td>The path to save the file. If it is a directory, the saved file name will be consistent with the input file name</td>
+<td>None</td>
+</tr>
+</tbody>
+</table>
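+
+For instance, a brief sketch combining the saving options above (continuing the quick-start example; the output directory is illustrative):
+
+```python
+for res in output:
+    # Pretty-print with 4-space indentation; keep non-ASCII characters unescaped.
+    res.print(format_json=True, indent=4, ensure_ascii=False)
+    # When save_path is a directory, the file name follows the input file name.
+    res.save_to_json(save_path="./output/", ensure_ascii=False)
+```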
              + +* Additionally, it supports obtaining the visualization image with results and the prediction results through attributes, as follows: + + + + + + + + + + + + + + + + +
+<table>
+<thead>
+<tr>
+<th>Attribute</th>
+<th>Attribute Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>json</code></td>
+<td>Get the prediction result in <code>json</code> format</td>
+</tr>
+<tr>
+<td><code>img</code></td>
+<td>Get the visualization image in <code>dict</code> format</td>
+</tr>
+</tbody>
+</table>
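+
+A short sketch of reading these attributes (continuing the example above; the exact key layout of `img` follows the table):
+
+```python
+for res in output:
+    data = res.json  # prediction result as a JSON-compatible dict
+    imgs = res.img   # visualization image(s), returned as a dict
+```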
              + +## 4. Custom Development + +Since PaddleOCR does not natively support training for text line orientation classification, refer to [PaddleX's Custom Development Guide](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/textline_orientation_classification.html#iv-custom-development) for training. Trained models can seamlessly integrate into PaddleOCR's API for inference. diff --git a/docs/version3.x/module_usage/text_line_orientation_classification.md b/docs/version3.x/module_usage/textline_orientation_classification.md similarity index 68% rename from docs/version3.x/module_usage/text_line_orientation_classification.md rename to docs/version3.x/module_usage/textline_orientation_classification.md index b88d009839f138ca41883111b29a5dffc4be4f82..30a56c4fca7c0b01bf9795f572306aa0e2317b4a 100644 --- a/docs/version3.x/module_usage/text_line_orientation_classification.md +++ b/docs/version3.x/module_usage/textline_orientation_classification.md @@ -26,15 +26,25 @@ comments: true PP-LCNet_x0_25_textline_ori推理模型/训练模型 -95.54 +98.85 - - -0.32 +0.96 基于PP-LCNet_x0_25的文本行分类模型,含有两个类别,即0度,180度 + +PP-LCNet_x1_0_textline_ori推理模型/训练模型 +99.42 +- +- +6.5 +基于PP-LCNet_x1_0的文本行分类模型,含有两个类别,即0度,180度 + +> ❗ :文本行方向分类模型于 2025.5.26 升级,并增加 `PP-LCNet_x1_0_textline_ori`,如需使用升级前的模型权重,请点击下载链接。 + 测试环境说明:
                @@ -86,7 +96,7 @@ comments: true 使用一行命令即可快速体验: ```bash -paddleocr text_line_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg +paddleocr textline_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg ``` 您也可以将文本行方向分类模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg)到本地。 @@ -104,7 +114,7 @@ for res in output: 运行后,得到的结果为: ```bash -{'res': {'input_path': 'textline_rot180_demo.jpg', 'page_index': None, 'class_ids': array([1], dtype=int32), 'scores': array([1.], dtype=float32), 'label_names': ['180_degree']}} +{'res': {'input_path': 'textline_rot180_demo.jpg', 'page_index': None, 'class_ids': array([1], dtype=int32), 'scores': array([0.99864], dtype=float32), 'label_names': ['180_degree']}} ``` 运行结果参数含义如下: @@ -127,81 +137,111 @@ for res in output: 参数 参数说明 参数类型 -可选项 默认值 + model_name 模型名称 str -无 - +None model_dir 模型存储路径 str -无 -无 +None device -模型推理设备 +用于推理的设备。
                +例如:cpugpunpugpu:0gpu:0,1
                +如指定多个设备,将进行并行推理。
                +默认情况下,优先使用 GPU 0;若不可用则使用 CPU。 + str -支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 -gpu:0 +None -use_hpip -是否启用高性能推理插件 +enable_hpi +是否启用高性能推理。 bool -无 False -hpi_config -高性能推理配置 -dict | None -无 +use_tensorrt +是否启用 Paddle Inference 的 TensorRT 子图引擎。 +bool +False + + +min_subgraph_size +当使用 Paddle Inference 的 TensorRT 子图引擎时,设置的最小子图大小。 +int +3 + + +precision +当使用 Paddle Inference 的 TensorRT 子图引擎时设置的计算精度。
                可选项:fp32fp16 等。 +str +fp32 + + +enable_mkldnn + +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
                + +bool +True + + +cpu_threads +在 CPU 上推理时使用的线程数量。 +int +10 + + +top_k +预测结果的前topk值,如果不指定,将默认使用PaddleOCR官方模型配置。若值为5,表示打印(返回)预测结果的前5个类别和对应的分类概率 +int None + -* 其中,`model_name` 必须指定,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + + +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleOCR 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 * 调用文本行方向分类模型的 `predict()` 方法进行推理预测,该方法会返回一个结果列表。另外,本模块还提供了 `predict_iter()` 方法。两者在参数接受和结果返回方面是完全一致的,区别在于 `predict_iter()` 返回的是一个 `generator`,能够逐步处理和获取预测结果,适合处理大型数据集或希望节省内存的场景。可以根据实际需求选择使用这两种方法中的任意一种。`predict()` 方法参数有 `input` 和 `batch_size`,具体说明如下: + - - - - - + + - + -
                参数 参数说明 参数类型可选项 默认值
                input待预测数据,支持多种输入类型Python Var/str/list +待预测数据,支持多种输入类型,必填。
                  -
                • Python变量,如numpy.ndarray表示的图像数据
                • -
                • 文件路径,如图像文件的本地路径:/root/data/img.jpg
                • -
                • URL链接,如图像文件的网络URL:示例
                • -
                • 本地目录,该目录下需包含待预测数据文件,如本地路径:/root/data/
                • -
                • 列表,列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                • +
                • Python Var:如 numpy.ndarray 表示的图像数据
                • +
                • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
                • +
                • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
                Python Var|str|list
                batch_size批大小批大小,可设置为任意正整数。 int任意整数 1
                diff --git a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md index ddc3c5e9a921af627bac83bb6ab8d5f345aae883..582e80cff991d9ac299ba830694527d4ceeebccc 100644 --- a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md +++ b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md @@ -28,10 +28,10 @@ docker run -it --name paddle-npu-dev -v $(pwd):/work \ * Download and install the Python wheel installation package ```bash # Note: You need to install the CPU version of PaddlePaddle first -python -m pip install paddlepaddle==3.0.0.dev20250430 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu -python -m pip install paddle-custom-npu -i https://www.paddlepaddle.org.cn/packages/nightly/npu +python -m pip install paddlepaddle==3.0.0.dev20250527 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu +python -m pip install paddle-custom-npu==3.0.0.dev20250527 -i https://www.paddlepaddle.org.cn/packages/nightly/npu ``` -* CANN-8.0.RC2 does not support some versions of numpy and opencv, it is recommended to install the specified versions. +* CANN-8.0.0 does not support some versions of numpy and opencv, it is recommended to install the specified versions. ```bash python -m pip install numpy==1.26.4 python -m pip install opencv-python==3.4.18.65 diff --git a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md index 89c59bf61fb0cdfa0ac0fc9f5f82d66d4036b5a6..02268d32f71a1d43f3420f71059107066e77d89b 100644 --- a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md +++ b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md @@ -28,10 +28,10 @@ docker run -it --name paddle-npu-dev -v $(pwd):/work \ * 下载安装 wheel 安装包 ```bash # 注意需要先安装飞桨 cpu 版本 -python -m pip install paddlepaddle==3.0.0.dev20250430 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu -python -m pip install paddle-custom-npu -i https://www.paddlepaddle.org.cn/packages/nightly/npu +python -m pip install paddlepaddle==3.0.0.dev20250527 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu +python -m pip install paddle-custom-npu==3.0.0.dev20250527 -i https://www.paddlepaddle.org.cn/packages/nightly/npu ``` -* CANN-8.0.RC2 对 numpy 和 opencv 部分版本不支持,建议安装指定版本 +* CANN-8.0.0 对 numpy 和 opencv 部分版本不支持,建议安装指定版本 ```bash python -m pip install numpy==1.26.4 python -m pip install opencv-python==3.4.18.65 diff --git a/docs/version3.x/paddleocr_and_paddlex.en.md b/docs/version3.x/paddleocr_and_paddlex.en.md index 0bad2f200c11409563c118eb85cc5d67c13904c4..bde5971eeae5a3171518af8cbdf16b18bee3413b 100644 --- a/docs/version3.x/paddleocr_and_paddlex.en.md +++ b/docs/version3.x/paddleocr_and_paddlex.en.md @@ -16,6 +16,15 @@ PaddleOCR fully reuses the capabilities of PaddleX in the inference deployment p - The high-performance inference capabilities of PaddleOCR are achieved through PaddleX's Paddle2ONNX plugin and high-performance inference plugins. - The service deployment solutions of PaddleOCR are based on PaddleX's implementations. +It is important to note that although PaddleOCR uses PaddleX at the underlying level, thanks to PaddleX’s optional dependency installation feature, **installing the PaddleOCR inference package does not include all of PaddleX’s dependencies—only those required for OCR-related tasks are installed**. 
Therefore, users generally do not need to worry about excessive expansion of dependency size. Tested in May 2025, in an x86-64 + Linux + Python 3.10 environment, the total size of required dependencies increased only from 717 MB to 738 MB. + +The version correspondence between PaddleOCR and PaddleX is as follows: + +| PaddleOCR Version | PaddleX Version | +| --- | --- | +| `3.0.0` | `3.0.0` | +| `3.0.1` | `3.0.1` | + ## 2. Correspondence Between PaddleOCR Pipelines and PaddleX Pipeline Registration Names | PaddleOCR Pipeline | PaddleX Pipeline Registration Name | @@ -52,7 +61,7 @@ The exported PaddleX pipeline configuration file not only includes parameters su ### 3.3 Loading Pipeline Configuration Files in CLI -By specifying the path to the PaddleX pipeline configuration file using the `--paddlex_config` parameter, PaddleOCR will read its contents as the default configuration for the pipeline. Here is an example: +By specifying the path to the PaddleX pipeline configuration file using the `--paddlex_config` parameter, PaddleOCR will read its contents as the default configuration for the pipeline (this takes precedence over the default values of individual initialization parameters). Here is an example: ```bash paddleocr ocr --paddlex_config ocr_config.yaml ... @@ -60,7 +69,7 @@ paddleocr ocr --paddlex_config ocr_config.yaml ... ### 3.4 Loading Pipeline Configuration Files in Python API -When initializing the pipeline object, you can pass the path to the PaddleX pipeline configuration file or a configuration dictionary through the `paddlex_config` parameter, and PaddleOCR will use it as the default configuration. Here is an example: +When initializing the pipeline object, you can pass the path to the PaddleX pipeline configuration file or a configuration dictionary through the `paddlex_config` parameter, and PaddleOCR will use it as the default configuration (this takes precedence over the default values of individual initialization parameters). Here is an example: ```python from paddleocr import PaddleOCR diff --git a/docs/version3.x/paddleocr_and_paddlex.md b/docs/version3.x/paddleocr_and_paddlex.md index 8f19bf7fdca66217dcd21f0b167d0d6d796bc041..a8643eadcc94705bc638ff1d5ca03f4879e805de 100644 --- a/docs/version3.x/paddleocr_and_paddlex.md +++ b/docs/version3.x/paddleocr_and_paddlex.md @@ -16,6 +16,15 @@ PaddleOCR 在推理部署环节充分复用了 PaddleX 的能力,具体包括 - PaddleOCR 的高性能推理能力通过 PaddleX 的 Paddle2ONNX 插件及高性能推理插件实现。 - PaddleOCR 的服务化部署方案基于 PaddleX 的实现。 +需要特别说明的是,尽管 PaddleOCR 在底层使用了 PaddleX,但得益于 PaddleX 的可选依赖安装功能,**安装 PaddleOCR 推理包时并不会安装 PaddleX 的全部依赖,而只会安装 OCR 类任务需要使用到的依赖**,用户通常无需关心依赖体积的过度膨胀问题。2025 年 5 月测试,在 x86-64 + Linux + Python 3.10 环境中,需要安装的依赖总体积仅仅从 717 MB 增加到 738 MB。 + +PaddleOCR 和 PaddleX 的版本存在如下对应关系: + +| PaddleOCR 版本 | PaddleX 版本 | +| --- | --- | +| `3.0.0` | `3.0.0` | +| `3.0.1` | `3.0.1` | + ## 2. PaddleOCR 产线与 PaddleX 产线注册名的对应关系 | PaddleOCR 产线 | PaddleX 产线注册名 | @@ -52,7 +61,7 @@ pipeline.export_paddlex_config_to_yaml("ocr_config.yaml") ### 3.3 在 CLI 中加载产线配置文件 -通过 `--paddlex_config` 参数指定 PaddleX 产线配置文件的路径,PaddleOCR 会读取其中的内容作为产线的默认配置。示例如下: +通过 `--paddlex_config` 参数指定 PaddleX 产线配置文件的路径,PaddleOCR 会读取其中的内容作为产线的默认配置(优先级高于各参数默认初始化的值)。示例如下: ```bash paddleocr ocr --paddlex_config ocr_config.yaml ... @@ -60,7 +69,7 @@ paddleocr ocr --paddlex_config ocr_config.yaml ... 
### 3.4 在 Python API 中加载产线配置文件 -初始化产线对象时,可通过 `paddlex_config` 参数传入 PaddleX 产线配置文件路径或配置字典,PaddleOCR 会将其作为默认配置。示例如下: +初始化产线对象时,可通过 `paddlex_config` 参数传入 PaddleX 产线配置文件路径或配置字典,PaddleOCR 会将其作为默认配置(优先级高于各参数默认初始化的值)。示例如下: ```python from paddleocr import PaddleOCR diff --git a/docs/version3.x/pipeline_usage/OCR.en.md b/docs/version3.x/pipeline_usage/OCR.en.md index a531766557befb978778f93ddc9c3ad1d899bf3f..6c06dce616297e57b293d29da18351f4a2eec597 100644 --- a/docs/version3.x/pipeline_usage/OCR.en.md +++ b/docs/version3.x/pipeline_usage/OCR.en.md @@ -2,13 +2,13 @@ comments: true --- -# General OCR Pipeline Usage Guide +# General OCR Pipeline Usage Tutorial ## 1. OCR Pipeline Introduction OCR is a technology that converts text from images into editable text. It is widely used in fields such as document digitization, information extraction, and data processing. OCR can recognize printed text, handwritten text, and even certain types of fonts and symbols. -The general OCR pipeline is used to solve text recognition tasks by extracting text information from images and outputting it in text form. This pipeline supports the use of PP-OCRv3, PP-OCRv4, and PP-OCRv5 models, with the default model being the PP-OCRv5_mobile model released by PaddleOCR 3.0, which improves by 13 percentage points over PP-OCRv4_mobile in various scenarios. +The general OCR pipeline is used to solve text recognition tasks by extracting text information from images and outputting it in text form. This pipeline supports the use of PP-OCRv3, PP-OCRv4, and PP-OCRv5 models, with the default model being the PP-OCRv5_server model released by PaddleOCR 3.0, which improves by 13 percentage points over PP-OCRv4_server in various scenarios. @@ -16,7 +16,7 @@ The general OCR pipeline is used to solve text recognition tasks by extracting t - [Document Image Orientation Classification Module](../module_usage/doc_img_orientation_classification.md) (Optional) - [Text Image Unwarping Module](../module_usage/text_image_unwarping.md) (Optional) -- [Text Line Orientation Classification Module](../module_usage/text_line_orientation_classification.md) (Optional) +- [Text Line Orientation Classification Module](../module_usage/textline_orientation_classification.md) (Optional) - [Text Detection Module](../module_usage/text_detection.md) - [Text Recognition Module](../module_usage/text_recognition.md) @@ -49,7 +49,7 @@ In this pipeline, you can select models based on the benchmark test data provide
-Text Image Unwar'p Module (Optional):
+Text Image Unwarping Module (Optional):

@@ -70,6 +70,40 @@ In this pipeline, you can select models based on the benchmark test data provide
                +
                +Text Line Orientation Classification Module (Optional): + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<tr>
+<th>Model</th>
+<th>Model Download Link</th>
+<th>Top-1 Accuracy (%)</th>
+<th>GPU Inference Time (ms)<br/>[Normal Mode / High-Performance Mode]</th>
+<th>CPU Inference Time (ms)</th>
+<th>Model Size (M)</th>
+<th>Description</th>
+</tr>
+<tr>
+<td>PP-LCNet_x0_25_textline_ori</td>
+<td>Inference Model/Training Model</td>
+<td>98.85</td>
+<td>-</td>
+<td>-</td>
+<td>0.96</td>
+<td>Text line classification model based on PP-LCNet_x0_25, with two classes: 0 degrees and 180 degrees</td>
+</tr>
+<tr>
+<td>PP-LCNet_x1_0_textline_ori</td>
+<td>Inference Model/Training Model</td>
+<td>99.42</td>
+<td>-</td>
+<td>-</td>
+<td>6.5</td>
+<td>Text line classification model based on PP-LCNet_x1_0, with two classes: 0 degrees and 180 degrees</td>
+</tr>
                +
                +
                Text Detection Module: @@ -184,7 +218,7 @@ en_PP-OCRv4_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Model/Inference Model/Training Model - - + + - - + +
 <td>83.28</td>
 <td>4.82 / 1.20</td>
 <td>16.74 / 4.64</td>
-<td>88</td>
-<td>Lightweight model optimized for edge devices.</td>
+<td>11 M</td>
+<td>Lightweight recognition model of PP-OCRv4 with high inference efficiency, deployable on various hardware devices including edge devices</td>
 </tr>
 <tr>
 <td>PP-OCRv4_server_rec</td>
 <td>Inference Model/Training Model</td>
 <td>85.19</td>
 <td>6.58 / 2.43</td>
 <td>33.17 / 33.17</td>
-<td>151</td>
-<td>High-accuracy server-side model.</td>
+<td>87 M</td>
+<td>Server-side model of PP-OCRv4 with high inference accuracy, deployable on various server platforms</td>
 </tr>
                PP-OCRv3_mobile_recInference Model/Inference Model/ -SVTRv2, developed by FVL's OpenOCR team, won first prize in the PaddleOCR Algorithm Challenge, improving end-to-end recognition accuracy by 6% over PP-OCRv4. +SVTRv2 is a server-side text recognition model developed by the OpenOCR team from Fudan University Vision and Learning Lab (FVL). It won first prize in the PaddleOCR Algorithm Model Challenge - Task 1: OCR End-to-End Recognition, improving end-to-end recognition accuracy by 6% compared to PP-OCRv4 on List A.
                - - - - - + + + + + @@ -308,19 +342,19 @@ SVTRv2, developed by FVL's OpenOCR team, won first prize in the PaddleOCR Algori - - + +
-<th>Model</th><th>Download Links</th><th>Accuracy(%)</th><th>GPU Inference Time (ms)<br/>[Standard / High-Performance]</th><th>CPU Inference Time (ms)<br/>[Standard / High-Performance]</th><th>Model Size (MB)</th>
+<th>Model</th><th>Download Link</th><th>Recognition Avg Accuracy(%)</th><th>GPU Inference Time (ms)<br/>[Standard Mode / High Performance Mode]</th><th>CPU Inference Time (ms)<br/>[Standard Mode / High Performance Mode]</th><th>Model Size (M)</th>
 <th>Description</th>
 <td>65.07</td>
 <td>5.93 / 1.62</td>
 <td>20.73 / 7.32</td>
-<td>70</td>
-<td>RepSVTR, a mobile-optimized version of SVTRv2, won first prize in the PaddleOCR Challenge, improving accuracy by 2.5% over PP-OCRv4 with comparable speed.</td>
+<td>22.1 M</td>
+<td>RepSVTR is a mobile text recognition model based on SVTRv2. It won first prize in the PaddleOCR Algorithm Model Challenge - Task 1: OCR End-to-End Recognition, improving end-to-end recognition accuracy by 2.5% compared to PP-OCRv4 on List B while maintaining comparable inference speed.</td>
                -* English Recognition Models +* English Recognition Models - - - - - + + + + + @@ -329,8 +363,8 @@ en_PP-OCRv4_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/Inference Model/example), or directory (e.g., /root/data/); -
              • List: List of inputs, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"].
              • - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +
                Command line supports more parameter settings. Click to expand for detailed instructions on command line parameters. +
-<th>Model</th><th>Download Links</th><th>Accuracy(%)</th><th>GPU Inference Time (ms)<br/>[Standard / High-Performance]</th><th>CPU Inference Time (ms)<br/>[Standard / High-Performance]</th><th>Model Size (MB)</th>
+<th>Model</th><th>Download Link</th><th>Recognition Avg Accuracy(%)</th><th>GPU Inference Time (ms)<br/>[Standard Mode / High Performance Mode]</th><th>CPU Inference Time (ms)<br/>[Standard Mode / High Performance Mode]</th><th>Model Size (M)</th>
 <th>Description</th>
                Python Var|str|list
                save_pathPath to save inference results. If None, results are not saved locally.str
                doc_orientation_classify_model_nameName of the document orientation classification model. If None, the default pipeline model is used.strNone
                doc_orientation_classify_model_dirDirectory path of the document orientation classification model. If None, the official model is downloaded.strNone
                doc_unwarping_model_nameName of the text image correction model. If None, the default pipeline model is used.strNone
                doc_unwarping_model_dirDirectory path of the text image correction model. If None, the official model is downloaded.strNone
                text_detection_model_nameName of the text detection model. If None, the default pipeline model is used.strNone
                text_detection_model_dirDirectory path of the text detection model. If None, the official model is downloaded.strNone
                text_line_orientation_model_nameName of the text line orientation model. If None, the default pipeline model is used.strNone
                text_line_orientation_model_dirDirectory path of the text line orientation model. If None, the official model is downloaded.strNone
                text_line_orientation_batch_sizeBatch size for the text line orientation model. If None, defaults to 1.intNone
                text_recognition_model_nameName of the text recognition model. If None, the default pipeline model is used.strNone
                text_recognition_model_dirDirectory path of the text recognition model. If None, the official model is downloaded.strNone
                text_recognition_batch_sizeBatch size for the text recognition model. If None, defaults to 1.intNone
                use_doc_orientation_classifyWhether to enable document orientation classification. If None, defaults to pipeline initialization value (True).boolNone
                use_doc_unwarpingWhether to enable text image correction. If None, defaults to pipeline initialization value (True).boolNone
                use_textline_orientationWhether to enable text line orientation classification. If None, defaults to pipeline initialization value (True).boolNone
                text_det_limit_side_lenMaximum side length limit for text detection. -
                  -
                • int: Any integer > 0;
                • -
                • None: If None, defaults to pipeline initialization value (960).
                • -
                -
                intNone
                text_det_limit_typeSide length limit type for text detection. -
                  -
                • str: Supports min (ensures shortest side ≥ det_limit_side_len) or max (ensures longest side ≤ limit_side_len);
                • -
                • None: If None, defaults to pipeline initialization value (max).
                • -
                -
                strNone
                text_det_threshPixel threshold for text detection. Pixels with scores > this threshold are considered text. -
                  -
                • float: Any float > 0;
                • -
                • None: If None, defaults to pipeline initialization value (0.3).
                • -
                -
                floatNone
                text_det_box_threshBox threshold for text detection. Detected regions with average scores > this threshold are retained. -
                  -
                • float: Any float > 0;
                • -
                • None: If None, defaults to pipeline initialization value (0.6).
                • -
                -
                floatNone
                text_det_unclip_ratioExpansion ratio for text detection. Larger values expand text regions more. -
                  -
                • float: Any float > 0;
                • -
                • None: If None, defaults to pipeline initialization value (2.0).
                • -
                -
                floatNone
                text_det_input_shapeInput shape for text detection.tupleNone
                text_rec_score_threshScore threshold for text recognition. Results with scores > this threshold are retained. -
                  -
                • float: Any float > 0;
                • -
                • None: If None, defaults to pipeline initialization value (0.0, no threshold).
                • -
                -
                floatNone
                text_rec_input_shapeInput shape for text recognition.tupleNone
                langSpecifies the OCR model language. -
                  -
                • ch: Chinese;
                • -
                • en: English;
                • -
                • korean: Korean;
                • -
                • japan: Japanese;
                • -
                • chinese_cht: Traditional Chinese;
                • -
                • te: Telugu;
                • -
                • ka: Kannada;
                • -
                • ta: Tamil;
                • -
                • None: If None, defaults to ch.
                • -
                -
                strNone
                ocr_versionOCR model version. -
                  -
                • PP-OCRv5: Uses PP-OCRv5 models;
                • -
                • PP-OCRv4: Uses PP-OCRv4 models;
                • -
                • PP-OCRv3: Uses PP-OCRv3 models;
                • -
                • None: If None, defaults to PP-OCRv5 models.
                • -
                -
                strNone
                deviceDevice for inference. Supports: -
                  -
                • CPU: cpu;
                • -
                • GPU: gpu:0 (first GPU);
                • -
                • NPU: npu:0;
                • -
                • XPU: xpu:0;
                • -
                • MLU: mlu:0;
                • -
                • DCU: dcu:0;
                • -
                • None: If None, defaults to GPU 0 (if available) or CPU.
                • -
                -
                strNone
                enable_hpiWhether to enable high-performance inference.boolFalse
                use_tensorrtWhether to use TensorRT for acceleration.boolFalse
                min_subgraph_sizeMinimum subgraph size for model optimization.int3
                precisionComputation precision (e.g., fp32, fp16).strfp32
                enable_mkldnnWhether to enable MKL-DNN acceleration. If None, enabled by default.boolNone
                cpu_threadsNumber of CPU threads for inference.int8
                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - -
                ParameterParameter DescriptionParameter TypeDefault Value
                inputData to be predicted, required. Local path of an image file or PDF file: /root/data/img.jpg; URL link, such as the network URL of an image file or PDF file: Example; Local directory, which must contain images to be predicted, such as the local path: /root/data/ (currently, predicting PDFs in a directory is not supported; PDFs need to specify the exact file path). +str
                save_pathPath to save inference result files. If not set, inference results will not be saved locally.str
                doc_orientation_classify_model_nameName of the document orientation classification model. If not set, the pipeline default model will be used.str
                doc_orientation_classify_model_dirDirectory path of the document orientation classification model. If not set, the official model will be downloaded.str
                doc_unwarping_model_nameName of the text image unwarping model. If not set, the pipeline default model will be used.str
                doc_unwarping_model_dirDirectory path of the text image unwarping model. If not set, the official model will be downloaded.str
                text_detection_model_nameName of the text detection model. If not set, the pipeline default model will be used.str
                text_detection_model_dirDirectory path of the text detection model. If not set, the official model will be downloaded.str
                textline_orientation_model_nameName of the text line orientation model. If not set, the pipeline default model will be used.str
                textline_orientation_model_dirDirectory path of the text line orientation model. If not set, the official model will be downloaded.str
                textline_orientation_batch_sizeBatch size for the text line orientation model. If not set, the default batch size will be 1.int
                text_recognition_model_nameName of the text recognition model. If not set, the pipeline default model will be used.str
                text_recognition_model_dirDirectory path of the text recognition model. If not set, the official model will be downloaded.str
                text_recognition_batch_sizeBatch size for the text recognition model. If not set, the default batch size will be 1.int
                use_doc_orientation_classifyWhether to load and use the document orientation classification module. If not set, the pipeline's initialized value for this parameter (initialized to True) will be used.bool
                use_doc_unwarpingWhether to load and use the text image unwarping module. If not set, the pipeline's initialized value for this parameter (initialized to True) will be used.bool
                use_textline_orientationWhether to load and use the text line orientation module. If not set, the pipeline's initialized value for this parameter (initialized to True) will be used.bool
                text_det_limit_side_lenImage side length limitation for text detection. +Any integer greater than 0. If not set, the pipeline's initialized value for this parameter (initialized to 64) will be used. +int
                text_det_limit_typeType of side length limit for text detection. +Supports min and max. min means ensuring the shortest side of the image is not smaller than det_limit_side_len, and max means ensuring the longest side of the image is not larger than limit_side_len. If not set, the pipeline's initialized value for this parameter (initialized to min) will be used. +str
                text_det_threshPixel threshold for text detection. In the output probability map, pixels with scores higher than this threshold will be considered text pixels.Any floating-point number greater than 0. If not set, the pipeline's initialized value for this parameter (0.3) will be used. +float
                text_det_box_threshText detection box threshold. If the average score of all pixels within the detected result boundary is higher than this threshold, the result will be considered a text region. +Any floating-point number greater than 0. If not set, the pipeline's initialized value for this parameter (0.6) will be used. +float
                text_det_unclip_ratioText detection expansion coefficient. This method is used to expand the text region—the larger the value, the larger the expanded area. +Any floating-point number greater than 0. If not set, the pipeline's initialized value for this parameter (2.0) will be used. +float
                text_det_input_shapeInput shape for text detection, you can set three values to represent C, H, and W.int
                text_rec_score_threshText recognition threshold. Text results with scores higher than this threshold will be retained.Any floating-point number greater than 0 +. If not set, the pipeline's initialized value for this parameter (0.0, i.e., no threshold) will be used. +float
                text_rec_input_shapeInput shape for text recognition.tuple
                langOCR model language to use. Please refer to the detailed list of languages below. +str
                ocr_versionOCR version, note that not every ocr_version supports all lang. +
                  +
                • PP-OCRv5: Use PP-OCRv5 series models; +
                • PP-OCRv4: Use PP-OCRv4 series models; +
                • PP-OCRv3: Use PP-OCRv3 series models.
                • +
                +
                str
+<tr><td><code>det_model_dir</code></td><td>Deprecated. Use <code>text_detection_model_dir</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>str</code></td><td></td></tr>
+<tr><td><code>det_limit_side_len</code></td><td>Deprecated. Use <code>text_det_limit_side_len</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>int</code></td><td></td></tr>
+<tr><td><code>det_limit_type</code></td><td>Deprecated. Use <code>text_det_limit_type</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>str</code></td><td></td></tr>
+<tr><td><code>det_db_thresh</code></td><td>Deprecated. Use <code>text_det_thresh</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>float</code></td><td></td></tr>
+<tr><td><code>det_db_box_thresh</code></td><td>Deprecated. Use <code>text_det_box_thresh</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>float</code></td><td></td></tr>
+<tr><td><code>det_db_unclip_ratio</code></td><td>Deprecated. Use <code>text_det_unclip_ratio</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>float</code></td><td></td></tr>
+<tr><td><code>rec_model_dir</code></td><td>Deprecated. Use <code>text_recognition_model_dir</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>str</code></td><td></td></tr>
+<tr><td><code>rec_batch_num</code></td><td>Deprecated. Use <code>text_recognition_batch_size</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>int</code></td><td></td></tr>
+<tr><td><code>use_angle_cls</code></td><td>Deprecated. Use <code>use_textline_orientation</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>bool</code></td><td></td></tr>
+<tr><td><code>cls_model_dir</code></td><td>Deprecated. Use <code>textline_orientation_model_dir</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>str</code></td><td></td></tr>
+<tr><td><code>cls_batch_num</code></td><td>Deprecated. Use <code>textline_orientation_batch_size</code> instead; the old and new parameters cannot be specified simultaneously.</td><td><code>int</code></td><td></td></tr>
                deviceDevice for inference. Supports specifying a specific card number: +
                  +
                • CPU: cpu indicates using CPU for inference;
                • +
                • GPU: gpu:0 indicates using the 1st GPU for inference;
                • +
                • NPU: npu:0 indicates using the 1st NPU for inference;
                • +
                • XPU: xpu:0 indicates using the 1st XPU for inference;
                • +
                • MLU: mlu:0 indicates using the 1st MLU for inference;
                • +
                • DCU: dcu:0 indicates using the 1st DCU for inference;
                • +
                If not set, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used. +
                str
                enable_hpiWhether to enable high-performance inference.boolFalse
                use_tensorrtWhether to use TensorRT for inference acceleration.boolFalse
                min_subgraph_sizeMinimum subgraph size for optimizing model subgraph computation.int3
                precisionComputational precision, such as fp32, fp16.strfp32
                enable_mkldnnWhether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +boolTrue
                cpu_threadsNumber of threads used for inference on CPU.int8
                paddlex_configPath to PaddleX pipeline configuration file.Path to the PaddleX pipeline configuration file. strNone
                -
                -
                + + + +
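+
+For reference, a sketch of a CLI invocation combining several of the flags documented above (the input path and flag values are illustrative, not recommended defaults):
+
+```bash
+paddleocr ocr -i ./general_ocr_002.png \
+    --lang ch --ocr_version PP-OCRv5 \
+    --use_textline_orientation False \
+    --text_det_limit_side_len 736 \
+    --device gpu:0 --save_path ./output/
+```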
                Results are printed to the terminal: @@ -848,6 +917,126 @@ Results are printed to the terminal: If `save_path` is specified, the visualization results will be saved under `save_path`. The visualization output is shown below: +
+<details><summary>Supported Language List</summary>
+<table>
+<thead>
+<tr>
+<th><code>ocr_version</code></th>
+<th>Languages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PP-OCRv5</td>
+<td>PP-OCRv5 supports the following languages:
+<ul>
+<li><code>ch</code>: Simplified Chinese;</li>
+<li><code>chinese_cht</code>: Traditional Chinese;</li>
+<li><code>en</code>: English;</li>
+<li><code>japan</code>: Japanese;</li>
+<li><code>korean</code>: Korean;</li>
+<li><code>te</code>: Telugu;</li>
+<li><code>ka</code>: Kannada;</li>
+<li><code>ta</code>: Tamil.</li>
+</ul>
+</td>
+</tr>
+<tr>
+<td>PP-OCRv4</td>
+<td>PP-OCRv4 supports the following languages:
+<ul>
+<li><code>ch</code>: Simplified Chinese;</li>
+<li><code>en</code>: English.</li>
+</ul>
+</td>
+</tr>
+<tr>
+<td>PP-OCRv3</td>
+<td>PP-OCRv3 supports the following languages:
+<details><summary>Language List</summary>
+<ul>
+<li><code>af</code>: Afrikaans;</li>
+<li><code>az</code>: Azerbaijani;</li>
+<li><code>bs</code>: Bosnian;</li>
+<li><code>cs</code>: Czech;</li>
+<li><code>cy</code>: Welsh;</li>
+<li><code>da</code>: Danish;</li>
+<li><code>de</code>: German;</li>
+<li><code>es</code>: Spanish;</li>
+<li><code>et</code>: Estonian;</li>
+<li><code>fr</code>: French;</li>
+<li><code>ga</code>: Irish;</li>
+<li><code>hr</code>: Croatian;</li>
+<li><code>hu</code>: Hungarian;</li>
+<li><code>id</code>: Indonesian;</li>
+<li><code>is</code>: Icelandic;</li>
+<li><code>it</code>: Italian;</li>
+<li><code>ku</code>: Kurdish;</li>
+<li><code>la</code>: Latin;</li>
+<li><code>lt</code>: Lithuanian;</li>
+<li><code>lv</code>: Latvian;</li>
+<li><code>mi</code>: Maori;</li>
+<li><code>ms</code>: Malay;</li>
+<li><code>mt</code>: Maltese;</li>
+<li><code>nl</code>: Dutch;</li>
+<li><code>no</code>: Norwegian;</li>
+<li><code>oc</code>: Occitan;</li>
+<li><code>pi</code>: Pali;</li>
+<li><code>pl</code>: Polish;</li>
+<li><code>pt</code>: Portuguese;</li>
+<li><code>ro</code>: Romanian;</li>
+<li><code>rs_latin</code>: Serbian (Latin);</li>
+<li><code>sk</code>: Slovak;</li>
+<li><code>sl</code>: Slovenian;</li>
+<li><code>sq</code>: Albanian;</li>
+<li><code>sv</code>: Swedish;</li>
+<li><code>sw</code>: Swahili;</li>
+<li><code>tl</code>: Tagalog;</li>
+<li><code>tr</code>: Turkish;</li>
+<li><code>uz</code>: Uzbek;</li>
+<li><code>vi</code>: Vietnamese;</li>
+<li><code>french</code>: French;</li>
+<li><code>german</code>: German;</li>
+<li><code>ar</code>: Arabic;</li>
+<li><code>fa</code>: Persian;</li>
+<li><code>ug</code>: Uighur;</li>
+<li><code>ur</code>: Urdu;</li>
+<li><code>ru</code>: Russian;</li>
+<li><code>rs_cyrillic</code>: Serbian (Cyrillic);</li>
+<li><code>be</code>: Belarusian;</li>
+<li><code>bg</code>: Bulgarian;</li>
+<li><code>uk</code>: Ukrainian;</li>
+<li><code>mn</code>: Mongolian;</li>
+<li><code>abq</code>: Abkhaz;</li>
+<li><code>ady</code>: Adyghe;</li>
+<li><code>kbd</code>: Kabardian;</li>
+<li><code>ava</code>: Avar;</li>
+<li><code>dar</code>: Dargwa;</li>
+<li><code>inh</code>: Ingush;</li>
+<li><code>che</code>: Chechen;</li>
+<li><code>lbe</code>: Lak;</li>
+<li><code>lez</code>: Lezgian;</li>
+<li><code>tab</code>: Tabasaran;</li>
+<li><code>hi</code>: Hindi;</li>
+<li><code>mr</code>: Marathi;</li>
+<li><code>ne</code>: Nepali;</li>
+<li><code>bh</code>: Bhojpuri;</li>
+<li><code>mai</code>: Maithili;</li>
+<li><code>ang</code>: Angika;</li>
+<li><code>bho</code>: Bhojpuri;</li>
+<li><code>mah</code>: Magahi;</li>
+<li><code>sck</code>: Nagpur;</li>
+<li><code>new</code>: Newar;</li>
+<li><code>gom</code>: Goan Konkani;</li>
+<li><code>sa</code>: Sanskrit;</li>
+<li><code>bgc</code>: Haryanvi.</li>
+</ul>
+</details>
+</td>
+</tr>
+</tbody>
+</table>
+</details>
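+
+Since not every `ocr_version` ships every `lang`, pin both parameters when selecting a non-default language. A minimal Python sketch (the language choice here is illustrative):
+
+```python
+from paddleocr import PaddleOCR
+
+# `fr` appears only in the PP-OCRv3 list above, so select that version explicitly.
+ocr = PaddleOCR(lang="fr", ocr_version="PP-OCRv3")
+```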
                + ### 2.2 Python Script Integration @@ -864,6 +1053,13 @@ ocr = PaddleOCR( # ocr = PaddleOCR(lang="en") # Uses English model by specifying language parameter # ocr = PaddleOCR(ocr_version="PP-OCRv4") # Uses other PP-OCR versions via version parameter # ocr = PaddleOCR(device="gpu") # Enables GPU acceleration for model inference via device parameter +# ocr = PaddleOCR( +# text_detection_model_name="PP-OCRv5_mobile_det", +# text_recognition_model_name="PP-OCRv5_mobile_rec", +# use_doc_orientation_classify=False, +# use_doc_unwarping=False, +# use_textline_orientation=False, +# ) # Switch to PP-OCRv5_mobile models result = ocr.predict("./general_ocr_002.png") for res in result: res.print() @@ -871,493 +1067,470 @@ for res in result: res.save_to_json("output") ``` -The Python script above performs the following steps: - -
                (1) Initialize the OCR pipeline with PaddleOCR(). Parameter details: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +In the above Python script, the following steps are performed: + +
                (1) Instantiate the OCR pipeline object via PaddleOCR(), with specific parameter descriptions as follows: + +
                ParameterDescriptionTypeDefault
                doc_orientation_classify_model_nameName of the document orientation model. If None, uses the default pipeline model.strNone
                doc_orientation_classify_model_dirDirectory path of the document orientation model. If None, downloads the official model.strNone
                doc_unwarping_model_nameName of the text image correction model. If None, uses the default pipeline model.strNone
                doc_unwarping_model_dirDirectory path of the text image correction model. If None, downloads the official model.strNone
                text_detection_model_nameName of the text detection model. If None, uses the default pipeline model.strNone
                text_detection_model_dirDirectory path of the text detection model. If None, downloads the official model.strNone
                text_line_orientation_model_nameName of the text line orientation model. If None, uses the default pipeline model.strNone
                text_line_orientation_model_dirDirectory path of the text line orientation model. If None, downloads the official model.strNone
                text_line_orientation_batch_sizeBatch size for the text line orientation model. If None, defaults to 1.intNone
                text_recognition_model_nameName of the text recognition model. If None, uses the default pipeline model.strNone
                text_recognition_model_dirDirectory path of the text recognition model. If None, downloads the official model.strNone
                text_recognition_batch_sizeBatch size for the text recognition model. If None, defaults to 1.intNone
                use_doc_orientation_classifyWhether to enable document orientation classification. If None, defaults to pipeline initialization (True).boolNone
                use_doc_unwarpingWhether to enable text image correction. If None, defaults to pipeline initialization (True).boolNone
                use_textline_orientationWhether to enable text line orientation classification. If None, defaults to pipeline initialization (True).boolNone
text_det_limit_side_lenMaximum side length limit for text detection. • int: Any integer > 0; • None: If None, defaults to pipeline initialization (960).intNone
text_det_limit_typeSide length limit type for text detection. • str: Supports min (ensures shortest side ≥ det_limit_side_len) or max (ensures longest side ≤ limit_side_len); • None: If None, defaults to pipeline initialization (max).strNone
text_det_threshPixel threshold for text detection. Pixels with scores > this threshold are considered text. • float: Any float > 0; • None: If None, defaults to pipeline initialization (0.3).floatNone
text_det_box_threshBox threshold for text detection. Detected regions with average scores > this threshold are retained. • float: Any float > 0; • None: If None, defaults to pipeline initialization (0.6).floatNone
text_det_unclip_ratioExpansion ratio for text detection. Larger values expand text regions more. • float: Any float > 0; • None: If None, defaults to pipeline initialization (2.0).floatNone
                text_det_input_shapeInput shape for text detection.tupleNone
text_rec_score_threshScore threshold for text recognition. Results with scores > this threshold are retained. • float: Any float > 0; • None: If None, defaults to pipeline initialization (0.0, no threshold).floatNone
                text_rec_input_shapeInput shape for text recognition.tupleNone
langSpecifies the OCR model language. • ch: Chinese; • en: English; • korean: Korean; • japan: Japanese; • chinese_cht: Traditional Chinese; • te: Telugu; • ka: Kannada; • ta: Tamil; • None: If None, defaults to ch.strNone
ocr_versionOCR model version. • PP-OCRv5: Uses PP-OCRv5 models; • PP-OCRv4: Uses PP-OCRv4 models; • PP-OCRv3: Uses PP-OCRv3 models; • None: If None, defaults to PP-OCRv5 models.strNone
deviceDevice for inference. Supports: • CPU: cpu; • GPU: gpu:0 (first GPU); • NPU: npu:0; • XPU: xpu:0; • MLU: mlu:0; • DCU: dcu:0; • None: If None, defaults to GPU 0 (if available) or CPU.strNone
                enable_hpiWhether to enable high-performance inference.boolFalse
                use_tensorrtWhether to use TensorRT for acceleration.boolFalse
                min_subgraph_sizeMinimum subgraph size for model optimization.int3
                precisionComputation precision (e.g., fp32, fp16).strfp32
                enable_mkldnnWhether to enable MKL-DNN acceleration. If None, enabled by default.boolNone
                cpu_threadsNumber of CPU threads for inference.int8
                ParameterParameter DescriptionParameter TypeDefault Value
                doc_orientation_classify_model_nameName of the document orientation classification model. If set to None, the pipeline's default model will be used.strNone
                doc_orientation_classify_model_dirDirectory path of the document orientation classification model. If set to None, the official model will be downloaded.strNone
                doc_unwarping_model_nameName of the text image unwarping model. If set to None, the pipeline's default model will be used.strNone
                doc_unwarping_model_dirDirectory path of the text image unwarping model. If set to None, the official model will be downloaded.strNone
                text_detection_model_nameName of the text detection model. If set to None, the pipeline's default model will be used.strNone
                text_detection_model_dirDirectory path of the text detection model. If set to None, the official model will be downloaded.strNone
                textline_orientation_model_nameName of the text line orientation model. If set to None, the pipeline's default model will be used.strNone
                textline_orientation_model_dirDirectory path of the text line orientation model. If set to None, the official model will be downloaded.strNone
                textline_orientation_batch_sizeBatch size for the text line orientation model. If set to None, the default batch size will be 1.intNone
                text_recognition_model_nameName of the text recognition model. If set to None, the pipeline's default model will be used.strNone
                text_recognition_model_dirDirectory path of the text recognition model. If set to None, the official model will be downloaded.strNone
                text_recognition_batch_sizeBatch size for the text recognition model. If set to None, the default batch size will be 1.intNone
                use_doc_orientation_classifyWhether to load and use the document orientation classification module. If set to None, the pipeline's initialized value for this parameter (initialized to True) will be used.boolNone
                use_doc_unwarpingWhether to load and use the text image unwarping module. If set to None, the pipeline's initialized value for this parameter (initialized to True) will be used.boolNone
                use_textline_orientationWhether to load and use the text line orientation module. If set to None, the pipeline's initialized value for this parameter (initialized to True) will be used.boolNone
text_det_limit_side_lenImage side length limitation for text detection. • int: Any integer greater than 0; • None: If set to None, the pipeline's initialized value for this parameter (initialized to 64) will be used.intNone
text_det_limit_typeType of side length limit for text detection. • str: Supports min and max, where min ensures the shortest side of the image is no smaller than det_limit_side_len, and max ensures the longest side is no larger than limit_side_len; • None: If set to None, the pipeline's initialized value for this parameter (initialized to min) will be used.strNone
text_det_threshPixel threshold for text detection. Pixels with scores higher than this threshold in the output probability map are considered text pixels. • float: Any floating-point number greater than 0; • None: If set to None, the pipeline's initialized value for this parameter (0.3) will be used.floatNone
text_det_box_threshBox threshold for text detection. A detection result is considered a text region if the average score of all pixels within the bounding box is higher than this threshold. • float: Any floating-point number greater than 0; • None: If set to None, the pipeline's initialized value for this parameter (0.6) will be used.floatNone
text_det_unclip_ratioDilation coefficient for text detection. This method is used to dilate the text region; the larger the value, the larger the dilated area. • float: Any floating-point number greater than 0; • None: If set to None, the pipeline's initialized value for this parameter (2.0) will be used.floatNone
                text_det_input_shapeInput shape for text detection.tupleNone
text_rec_score_threshRecognition score threshold for text. Text results with scores higher than this threshold are retained. • float: Any floating-point number greater than 0; • None: If set to None, the pipeline's initialized value for this parameter (0.0, i.e., no threshold) will be used.floatNone
                text_rec_input_shapeInput shape for text recognition.tupleNone
langOCR model language to use. Please refer to the detailed list of languages above.strNone
ocr_versionOCR version. Note that not every ocr_version supports every lang. • PP-OCRv5: Use PP-OCRv5 series models; • PP-OCRv4: Use PP-OCRv4 series models; • PP-OCRv3: Use PP-OCRv3 series models.strNone
deviceDevice for inference. Supports specifying a specific card number: • CPU: e.g., cpu for CPU inference; • GPU: e.g., gpu:0 for inference on the 1st GPU; • NPU: e.g., npu:0 for inference on the 1st NPU; • XPU: e.g., xpu:0 for inference on the 1st XPU; • MLU: e.g., mlu:0 for inference on the 1st MLU; • DCU: e.g., dcu:0 for inference on the 1st DCU; • None: If set to None, the pipeline's initialized value is used; local GPU device 0 is preferred if available, otherwise the CPU is used.strNone
                enable_hpiWhether to enable high-performance inference.boolFalse
                use_tensorrtWhether to use TensorRT for inference acceleration.boolFalse
                min_subgraph_sizeMinimum subgraph size for optimizing subgraph computation.int3
                precisionComputational precision, such as fp32, fp16.str"fp32"
                enable_mkldnnWhether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set.boolTrue
                cpu_threadsNumber of threads used for CPU inference.int8
paddlex_configPath to the PaddleX pipeline configuration file.strNone
                -
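To make the initialization options above concrete, here is a minimal sketch that combines several of the documented parameters; the specific model names and threshold values are illustrative choices, not recommendations:

```python
from paddleocr import PaddleOCR

# A possible combination of the initialization parameters documented above.
ocr = PaddleOCR(
    text_detection_model_name="PP-OCRv5_server_det",
    text_recognition_model_name="PP-OCRv5_server_rec",
    text_det_limit_side_len=64,   # pipeline default per the table
    text_det_limit_type="min",    # keep the shortest side >= the limit
    text_rec_score_thresh=0.5,    # drop low-confidence recognition results
    device="cpu",                 # or "gpu:0", "npu:0", ...
)
```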
(2) Call the predict() method for inference. Alternatively, predict_iter() returns a generator for memory-efficient batch processing. Parameters:
                ParameterDescriptionTypeDefault
inputInput data (required). Supports: • Python Var: e.g., numpy.ndarray image data; • str: local file path (e.g., /root/data/img.jpg), URL (e.g., example), or directory (e.g., /root/data/); • List: list of inputs, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"].Python Var|str|list
                deviceSame as initialization.strNone
                use_doc_orientation_classifyWhether to enable document orientation classification during inference.boolNone
                use_doc_unwarpingWhether to enable text image correction during inference.boolNone
                use_textline_orientationWhether to enable text line orientation classification during inference.boolNone
                text_det_limit_side_lenSame as initialization.intNone
                text_det_limit_typeSame as initialization.strNone
                text_det_threshSame as initialization.floatNone
                text_det_box_threshSame as initialization.floatNone
                text_det_unclip_ratioSame as initialization.floatNone
(2) Invoke the predict() method of the OCR pipeline object to run inference; it returns a list of results. The pipeline also provides a predict_iter() method, which accepts the same parameters and returns the same results, except that it returns a generator that yields predictions incrementally; this is useful for large datasets or when saving memory matters. Choose whichever method fits your needs. The parameters of the predict() method are described below:
                ParameterParameter DescriptionParameter TypeDefault Value
inputData to be predicted; multiple input types are supported; required. • Python Var: image data represented by numpy.ndarray; • str: the local path of an image or PDF file, e.g., /root/data/img.jpg; a URL, e.g., the network URL of an image or PDF file: example; or a local directory containing the images to be predicted, e.g., /root/data/ (predicting PDF files inside a directory is currently not supported; PDF files must be given as explicit file paths); • List: list elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"].Python Var|str|list
                use_doc_orientation_classifyWhether to use the document orientation classification module during inference.boolNone
                use_doc_unwarpingWhether to use the text image unwarping module during inference.boolNone
                use_textline_orientationWhether to use the text line orientation classification module during inference.boolNone
                text_det_limit_side_lenThe same as the parameter during instantiation.intNone
                text_det_limit_typeThe same as the parameter during instantiation.strNone
                text_det_threshThe same as the parameter during instantiation.floatNone
                text_det_box_threshThe same as the parameter during instantiation.floatNone
                text_det_unclip_ratioThe same as the parameter during instantiation.floatNone
text_rec_score_threshThe same as the parameter during instantiation.floatNone
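A minimal sketch contrasting the two methods, assuming the ocr object created earlier (the extra file paths are placeholders):

```python
# predict() materializes all results as a list.
results = ocr.predict(input="./general_ocr_002.png")

# predict_iter() accepts the same arguments but yields results one by one,
# which keeps memory usage flat on large batches.
for res in ocr.predict_iter(input=["./img1.jpg", "./img2.jpg"]):
    res.print()  # each result is processed as soon as it is produced
```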
                -
(3) Processing prediction results: Each sample's prediction result is a corresponding Result object, supporting printing, saving as images, and saving as JSON files:
(3) Process the prediction results. Each sample's prediction is a corresponding Result object, which supports printing, saving as an image, and saving as a JSON file:
MethodMethod DescriptionParameterParameter TypeParameter DescriptionDefault Value
print()Print the results to the terminalformat_jsonboolWhether to format the output content with JSON indentation.True
indentintIndentation level to beautify the JSON output and make it more readable; only valid when format_json is True.4
ensure_asciiboolWhether to escape non-ASCII characters as Unicode. When True, all non-ASCII characters are escaped; False keeps the original characters. Only valid when format_json is True.False
save_to_json()Save the results as a JSON filesave_pathstrFile path to save. When a directory is given, the saved file is named after the input file.No default
indentintIndentation level to beautify the JSON output; only valid when format_json is True.4
ensure_asciiboolWhether to escape non-ASCII characters as Unicode; only valid when format_json is True.False
save_to_img()Save the results as an image filesave_pathstrFile path to save; a directory or a file path is supported.No default
-- The print() method outputs results to the terminal with the following structure:
-    - input_path: (str) Input image path
-    - page_index: (Union[int, None]) PDF page number (if the input is a PDF), otherwise None
-    - model_settings: (Dict[str, bool]) Pipeline configuration
-        - use_doc_preprocessor: (bool) Whether document preprocessing is enabled
-        - use_textline_orientation: (bool) Whether text line orientation classification is enabled
-    - doc_preprocessor_res: (Dict[str, Union[str, Dict[str, bool], int]]) Document preprocessing results (only when use_doc_preprocessor=True)
-        - input_path: (Union[str, None]) Preprocessor input path (None for numpy.ndarray input)
-        - model_settings: (Dict) Preprocessor configuration
-            - use_doc_orientation_classify: (bool) Whether document orientation classification is enabled
-            - use_doc_unwarping: (bool) Whether text image correction is enabled
-        - angle: (int) Document orientation prediction (0-3 for 0°, 90°, 180°, 270°; -1 if disabled)
-    - dt_polys: (List[numpy.ndarray]) Text detection polygons (4 vertices per box, shape=(4,2), dtype=int16)
-    - dt_scores: (List[float]) Text detection confidence scores
-    - text_det_params: (Dict[str, Dict[str, int, float]]) Text detection parameters
-        - limit_side_len: (int) Image side length limit
-        - limit_type: (str) Length limit handling method
-        - thresh: (float) Text pixel classification threshold
-        - box_thresh: (float) Detection box confidence threshold
-        - unclip_ratio: (float) Text region expansion ratio
-        - text_type: (str) Fixed as "general"
-    - textline_orientation_angles: (List[int]) Text line orientation predictions (actual angles when enabled, [-1,-1,-1] when disabled)
-    - text_rec_score_thresh: (float) Text recognition score threshold
-    - rec_texts: (List[str]) Recognized texts (filtered by text_rec_score_thresh)
-    - rec_scores: (List[float]) Recognition confidence scores (filtered)
-    - rec_polys: (List[numpy.ndarray]) Filtered detection polygons (same format as dt_polys)
-    - rec_boxes: (numpy.ndarray) Rectangular bounding boxes (shape=(n,4), dtype=int16) with [x_min, y_min, x_max, y_max] coordinates
-- save_to_json() saves results to the specified save_path:
-    - Directory: saves as save_path/{your_img_basename}_res.json
-    - File: saves directly to the specified path
-    - Note: numpy.array values are converted to lists, since JSON does not support numpy arrays
-- save_to_img() saves visualization results:
-    - Directory: saves as save_path/{your_img_basename}_ocr_res_img.{your_img_extension}
-    - File: saves directly (not recommended for multiple images, to avoid overwriting)
+
+- Calling the print() method will print the results to the terminal. The content printed to the terminal is explained as follows:
+    - input_path: (str) Input path of the image to be predicted
+    - page_index: (Union[int, None]) If the input is a PDF file, this indicates which page of the PDF it is; otherwise it is None
+    - model_settings: (Dict[str, bool]) Model parameters configured for the pipeline
+        - use_doc_preprocessor: (bool) Controls whether the document preprocessing sub-pipeline is enabled
+        - use_textline_orientation: (bool) Controls whether the text line orientation classification function is enabled
+    - doc_preprocessor_res: (Dict[str, Union[str, Dict[str, bool], int]]) Output of the document preprocessing sub-pipeline; only exists when use_doc_preprocessor=True
+        - input_path: (Union[str, None]) Image path accepted by the preprocessing sub-pipeline; saved as None when the input is a numpy.ndarray
+        - model_settings: (Dict) Model configuration of the preprocessing sub-pipeline
+            - use_doc_orientation_classify: (bool) Controls whether document orientation classification is enabled
+            - use_doc_unwarping: (bool) Controls whether text image unwarping is enabled
+        - angle: (int) Prediction of document orientation classification; when enabled, one of [0,1,2,3], corresponding to [0°,90°,180°,270°]; when disabled, -1
+    - dt_polys: (List[numpy.ndarray]) Text detection polygon boxes; each box is a numpy array of 4 vertex coordinates with shape (4, 2) and dtype int16
+    - dt_scores: (List[float]) Confidence scores of the text detection boxes
+    - text_det_params: (Dict[str, Dict[str, int, float]]) Configuration parameters of the text detection module
+        - limit_side_len: (int) Side length limit used during image preprocessing
+        - limit_type: (str) How the side length limit is applied
+        - thresh: (float) Confidence threshold for text pixel classification
+        - box_thresh: (float) Confidence threshold for text detection boxes
+        - unclip_ratio: (float) Dilation coefficient of the text detection boxes
+        - text_type: (str) Type of text detection, currently fixed as "general"
+    - textline_orientation_angles: (List[int]) Predictions of text line orientation classification; actual angle values (e.g., [0,0,1]) when enabled, [-1,-1,-1] when disabled
+    - text_rec_score_thresh: (float) Filtering threshold for text recognition results
+    - rec_texts: (List[str]) Text recognition results, containing only texts whose confidence exceeds text_rec_score_thresh
+    - rec_scores: (List[float]) Confidence scores of the recognized texts, filtered by text_rec_score_thresh
+    - rec_polys: (List[numpy.ndarray]) Confidence-filtered text detection boxes, in the same format as dt_polys
+    - rec_boxes: (numpy.ndarray) Rectangular bounding boxes of the detections, with shape (n, 4) and dtype int16; each row holds [x_min, y_min, x_max, y_max], where (x_min, y_min) is the top-left corner and (x_max, y_max) the bottom-right corner
+- Calling the save_to_json() method will save the above content to the specified save_path. If a directory is given, the result is saved as save_path/{your_img_basename}_res.json; if a file is given, it is saved directly to that file. Since JSON files cannot store numpy arrays, numpy.array values are converted to lists.
+- Calling the save_to_img() method will save the visualization results to the specified save_path. If a directory is given, the result is saved as save_path/{your_img_basename}_ocr_res_img.{your_img_extension}; if a file is given, it is saved directly. (The pipeline usually generates many result images, so specifying a single file path is not recommended: successive images would overwrite each other, leaving only the last one.)
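A short sketch tying these methods together, assuming the ocr object created earlier; the output directory name is illustrative:

```python
import os

os.makedirs("output", exist_ok=True)
for res in ocr.predict(input="./general_ocr_002.png"):
    res.print(format_json=True, indent=4, ensure_ascii=False)
    res.save_to_json(save_path="output")  # -> output/general_ocr_002_res.json
    res.save_to_img(save_path="output")   # -> output/general_ocr_002_ocr_res_img.png
```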

Additionally, the visualized image with results and the prediction results can be obtained through the following attributes:

                - + - + - +
AttributeAttribute Description
jsonGet the prediction results in json format
imgGet the visualized image in dict format
                -- The `json` attribute returns prediction results as a dict, with content identical to what's saved by the `save_to_json()` method. -- The `img` attribute returns prediction results as a dictionary containing two `Image.Image` objects under keys `ocr_res_img` (OCR result visualization) and `preprocessed_img` (preprocessing visualization). If the image preprocessing submodule isn't used, only `ocr_res_img` will be present. +
+- The prediction results obtained via the json attribute are dict-typed data, consistent with the content saved by calling the save_to_json() method.
+- The img attribute returns a dict whose keys are ocr_res_img and preprocessed_img, each mapping to an Image.Image object: one visualizing the OCR results, the other visualizing the image preprocessing. If the image preprocessing submodule is not used, the dict contains only ocr_res_img.
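A small sketch of reading these attributes, assuming the ocr object created earlier (the output directory is illustrative):

```python
import os

os.makedirs("output", exist_ok=True)
for res in ocr.predict(input="./general_ocr_002.png"):
    data = res.json  # dict; same content as save_to_json()
    # res.img maps names to PIL Image.Image objects, e.g. "ocr_res_img"
    # and, when preprocessing is used, "preprocessed_img".
    for name, image in res.img.items():
        image.save(os.path.join("output", f"{name}.png"))
```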
@@ -1625,11 +1798,9 @@

## 4. Custom Development

If the default model weights provided by the General OCR Pipeline do not meet your expectations in terms of accuracy or speed for your specific scenario, you can leverage your own domain-specific or application-specific data to further fine-tune the existing models, thereby improving the recognition performance of the General OCR Pipeline in your use case.

### 4.1 Model Fine-Tuning

The general OCR pipeline consists of multiple modules. If the pipeline's performance does not meet expectations, the issue may stem from any of these modules. You can analyze poorly recognized images to identify the problematic module and refer to the corresponding fine-tuning tutorials in the table below for adjustments.

@@ -1670,9 +1841,40 @@ The general OCR pipeline consists of multiple modules. If the pipeline's perform
### 4.2 Model Deployment

-After fine-tuning the model with your private dataset, you will obtain local model weight files. You can then use these fine-tuned weights by customizing the pipeline configuration file.
-1. **Obtain the Pipeline Configuration File**
+After you complete fine-tuning with a private dataset, you obtain local model weight files. You can then use the fine-tuned weights either by passing the local model path as a parameter or by customizing the pipeline configuration file.
+
+#### 4.2.1 Specify the local model path through parameters
+
+When initializing the pipeline object, pass the local model path as a parameter. Taking the weights of a fine-tuned text detection model as an example:
+
+Command line mode:
+
+```bash
+# Specify the local model path via --text_detection_model_dir
+paddleocr ocr -i ./general_ocr_002.png --text_detection_model_dir your_det_model_path
+
+# PP-OCRv5_server_det is the default text detection model. If you fine-tuned a different model, change the model name via --text_detection_model_name
+paddleocr ocr -i ./general_ocr_002.png --text_detection_model_name PP-OCRv5_mobile_det --text_detection_model_dir your_v5_mobile_det_model_path
+```
+
+Script mode:
+
+```python
+from paddleocr import PaddleOCR
+
+# Specify the local model path via text_detection_model_dir
+pipeline = PaddleOCR(text_detection_model_dir="./your_det_model_path")
+
+# PP-OCRv5_server_det is the default text detection model. If you fine-tuned a different model, change the model name via text_detection_model_name
+# pipeline = PaddleOCR(text_detection_model_name="PP-OCRv5_mobile_det", text_detection_model_dir="./your_v5_mobile_det_model_path")
+```
+
+#### 4.2.2 Specify the local model path through the configuration file
+
+1. Obtain the pipeline configuration file

Call the `export_paddlex_config_to_yaml` method of the **General OCR Pipeline** object in PaddleOCR to export the current pipeline configuration as a YAML file:

@@ -1683,7 +1885,7 @@ pipeline = PaddleOCR()
 pipeline.export_paddlex_config_to_yaml("PaddleOCR.yaml")
 ```
-2. **Modify the Configuration File**
+2. Modify the configuration file

After obtaining the default pipeline configuration file, replace the paths of the default model weights with the local paths of your fine-tuned model weights.
For example:

@@ -1692,31 +1894,31 @@ After obtaining the default pipeline configuration file, replace the paths of th
 SubModules:
   TextDetection:
     box_thresh: 0.6
-    limit_side_len: 960
-    limit_type: max
+    limit_side_len: 64
+    limit_type: min
     max_side_limit: 4000
     model_dir: null # Replace with the path to your fine-tuned text detection model weights
-    model_name: PP-OCRv5_server_det
+    model_name: PP-OCRv5_server_det # If the name of the fine-tuned model differs from the default model name, modify it here as well
     module_name: text_detection
     thresh: 0.3
     unclip_ratio: 1.5
   TextLineOrientation:
     batch_size: 6
-    model_dir: null
-    model_name: PP-LCNet_x0_25_textline_ori
+    model_dir: null # Replace with the path to your fine-tuned text line orientation model weights
+    model_name: PP-LCNet_x1_0_textline_ori # If the name of the fine-tuned model differs from the default model name, modify it here as well
     module_name: textline_orientation
   TextRecognition:
     batch_size: 6
     model_dir: null # Replace with the path to your fine-tuned text recognition model weights
-    model_name: PP-OCRv5_server_rec
+    model_name: PP-OCRv5_server_rec # If the name of the fine-tuned model differs from the default model name, modify it here as well
     module_name: text_recognition
     score_thresh: 0.0
 ......
 ```

-The pipeline configuration file includes not only the parameters supported by the PaddleOCR CLI and Python API but also advanced configurations. For detailed instructions, refer to the [PaddleX Pipeline Usage Overview](https://paddlepaddle.github.io/PaddleX/3.0/pipeline_usage/pipeline_develop_guide.html) and adjust the configurations as needed.
+The pipeline configuration file includes not only the parameters supported by the PaddleOCR CLI and Python API but also advanced configurations. For detailed instructions, refer to the [PaddleX Pipeline Usage Overview](https://paddlepaddle.github.io/PaddleX/3.0/en/pipeline_usage/pipeline_develop_guide.html) and adjust the configurations as needed.

-3. **Load the Configuration File in CLI**
+3. Load the configuration file in the CLI

After modifying the configuration file, specify its path using the `--paddlex_config` parameter in the command line. PaddleOCR will read the file and apply the configurations. Example:

@@ -1724,7 +1926,7 @@ paddleocr ocr --paddlex_config PaddleOCR.yaml ...
 ```
-4. **Load the Configuration File in Python API**
+4. Load the configuration file in the Python API

When initializing the pipeline object, pass the path of the PaddleX pipeline configuration file or a configuration dictionary via the `paddlex_config` parameter. PaddleOCR will read and apply the configurations.
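The example referenced above lies outside this hunk; a minimal sketch consistent with the description, assuming the PaddleOCR.yaml exported in step 1, might look like this:

```python
from paddleocr import PaddleOCR

# Load the customized pipeline configuration exported and edited above.
pipeline = PaddleOCR(paddlex_config="PaddleOCR.yaml")
result = pipeline.predict("./general_ocr_002.png")
```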
diff --git a/docs/version3.x/pipeline_usage/OCR.md b/docs/version3.x/pipeline_usage/OCR.md
index 9a7d0746c557607450510bbfa96ce95fac9acc75..79194d1971db16e73d5896fbf01d336f3060b176 100644
--- a/docs/version3.x/pipeline_usage/OCR.md
+++ b/docs/version3.x/pipeline_usage/OCR.md
@@ -8,7 +8,7 @@ comments: true
 OCR (Optical Character Recognition) is a technology that converts text in images into editable text. It is widely used in document digitization, information extraction, and data processing. OCR can recognize printed text, handwritten text, and even certain types of fonts and symbols.
-The general OCR pipeline solves text recognition tasks by extracting the text in an image and outputting it in text form. It supports the PP-OCRv3, PP-OCRv4, and PP-OCRv5 models; the default is the PP-OCRv5_mobile model released with PaddleOCR 3.0, which improves over PP-OCRv4_mobile by 13 percentage points across multiple scenarios.
+The general OCR pipeline solves text recognition tasks by extracting the text in an image and outputting it in text form. It supports the PP-OCRv3, PP-OCRv4, and PP-OCRv5 models; the default is the PP-OCRv5_server model released with PaddleOCR 3.0, which improves over PP-OCRv4_server by 13 percentage points across multiple scenarios.
@@ -16,7 +16,7 @@
 - [Document image orientation classification module](../module_usage/doc_img_orientation_classification.md) (optional)
 - [Text image unwarping module](../module_usage/text_image_unwarping.md) (optional)
-- [Text line orientation classification module](../module_usage/text_line_orientation_classification.md) (optional)
+- [Text line orientation classification module](../module_usage/textline_orientation_classification.md) (optional)
 - [Text detection module](../module_usage/text_detection.md)
 - [Text recognition module](../module_usage/text_recognition.md)
@@ -70,6 +70,41 @@
+Text line orientation classification module (optional):
+
+ModelModel Download LinksTop-1 Acc (%)GPU Inference Time (ms)CPU Inference Time (ms)Model Size (M)Description
+PP-LCNet_x0_25_textline_oriInference Model/Training Model98.85--0.96Text line classification model based on PP-LCNet_x0_25, with two classes: 0° and 180°
+PP-LCNet_x1_0_textline_oriInference Model/Training Model99.42--6.5Text line classification model based on PP-LCNet_x1_0, with two classes: 0° and 180°
                +
                +
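As a hedged sketch of opting into the higher-accuracy model listed above, using the use_textline_orientation and textline_orientation_model_name parameters documented later on this page:

```python
from paddleocr import PaddleOCR

# Enable text line orientation classification and select the
# higher-accuracy PP-LCNet_x1_0_textline_ori model from the table above.
ocr = PaddleOCR(
    use_textline_orientation=True,
    textline_orientation_model_name="PP-LCNet_x1_0_textline_ori",
)
```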
Text detection module:

@@ -151,26 +186,26 @@ PP-OCRv5_mobile_rec_infer.tar">Inference Model/Training Model
81.536.65 / 2.3832.92 / 32.9274.7 MPP-OCRv4_server_rec_doc is trained on top of PP-OCRv4_server_rec with a mix of more Chinese document data and PP-OCR training data; it adds recognition of some traditional Chinese characters, Japanese, and special characters, supports 15,000+ characters, and improves both document-related and general text recognition
PP-OCRv4_mobile_recInference Model/Training Model78.744.82 / 1.2016.74 / 4.6410.6 MLightweight PP-OCRv4 recognition model with high inference efficiency, deployable on a wide range of hardware including edge devices
PP-OCRv4_server_recInference Model/Training Model80.616.58 / 2.4333.17 / 33.1771.2 MServer-side PP-OCRv4 model with high inference accuracy, deployable on various servers
81.536.65 / 2.3832.92 / 32.9274.7 MPP-OCRv4_server_rec_doc is trained on top of PP-OCRv4_server_rec with a mix of more Chinese document data and PP-OCR training data; it adds recognition of some traditional Chinese characters, Japanese, and special characters, supports 15,000+ characters, and improves both document-related and general text recognition
PP-OCRv4_mobile_recInference Model/Training Model78.744.82 / 1.2016.74 / 4.6410.6 MLightweight PP-OCRv4 recognition model with high inference efficiency, deployable on a wide range of hardware including edge devices
PP-OCRv4_server_recInference Model/Training Model80.616.58 / 2.4333.17 / 33.1771.2 MServer-side PP-OCRv4 model with high inference accuracy, deployable on various servers
PP-OCRv3_mobile_recInference Model/Training Model72.965.87 / 1.199.07 / 4.289.2 MLightweight PP-OCRv3 recognition model with high inference efficiency, deployable on a wide range of hardware including edge devices
@@ -516,7 +551,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Example; or a local directory containing the images to be predicted, e.g., /root/data/ (predicting PDF files inside a directory is currently not supported; PDF files must be given as explicit file paths)
• List: list elements must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
              +待预测数据,必填。如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_detection_model_name -文本检测模型的名称。如果设置为None, 将会使用产线默认模型。 +文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_detection_model_dir -文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + -text_line_orientation_model_name -文本行方向模型的名称。如果设置为None, 将会使用产线默认模型。 +textline_orientation_model_name +文本行方向模型的名称。如果不设置,将会使用产线默认模型。 str -None + -text_line_orientation_model_dir -文本行方向模型的目录路径。如果设置为None, 将会下载官方模型。 +textline_orientation_model_dir +文本行方向模型的目录路径。如果不设置,将会下载官方模型。 str -None + -text_line_orientation_batch_size -文本行方向模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +textline_orientation_batch_size +文本行方向模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + text_recognition_model_name -文本识别模型的名称。如果设置为None, 将会使用产线默认模型。 +文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_recognition_model_dir -文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + use_doc_orientation_classify -是否使用文档方向分类功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否使用文本图像矫正功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_textline_orientation -是否使用文本行方向功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本行方向模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + text_det_limit_side_len -文本检测的最大边长度限制。 -
                -
              • int:大于 0 的任意整数;
              • -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
              • -
              +文本检测的图像边长限制。 +大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 64int -None + text_det_limit_type -文本检测的边长度限制类型。 -
                -
              • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
              • -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
              • -
              +文本检测的边长度限制类型。支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len。如果不设置,将默认使用产线初始化的该参数值,初始化为 minstr -None + text_det_thresh 文本检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
              +大于0的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.3float -None + text_det_box_thresh 文本检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
              +大于 0 的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + text_det_unclip_ratio -文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
              +文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。大于0的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 2.0float -None + text_det_input_shape -文本检测的输入形状。 -tuple -None +文本检测的输入形状,您可以设置3个值代表C,H,W。 +int + text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
              +大于0的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + text_rec_input_shape 文本识别的输入形状。 tuple -None + lang 使用指定语言的 OCR 模型。 -
                -
              • ch:中文; -
              • en:英文; -
              • korean:韩文; -
              • japan:日文; -
              • chinese_cht:繁体中文; -
              • te:泰卢固文; -
              • ka:卡纳达文; -
              • ta:泰米尔文; -
              • None:如果设置为 None, 将默认使用ch
              • -
              +请查看下方的详细语言列表。 str -None + ocr_version -OCR 版本。 +OCR 版本,注意不是每个ocr_version都支持所有的lang
                -
              • PP-OCRv5:使用PP-OCRv5系列模型; -
              • PP-OCRv4:使用PP-OCRv4系列模型; -
              • PP-OCRv3:使用PP-OCRv3系列模型; -
              • None:如果设置为 None, 将默认使用PP-OCRv5系列模型;
              • -
              +
            • PP-OCRv5:使用PP-OCRv5系列模型; +
            • PP-OCRv4:使用PP-OCRv4系列模型; +
            • PP-OCRv3:使用PP-OCRv3系列模型。 str -None + det_model_dir -已废弃,请使用text_detection_model_dir代替。文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +已废弃,请参考text_detection_model_dir,且与新的参数不能同时指定。 str -None + det_limit_side_len -已废弃,请使用text_det_limit_side_len代替。文本检测的最大边长度限制。 +已废弃,请参考text_det_limit_side_len,且与新的参数不能同时指定。 int -None + det_limit_type -已废弃,请使用text_det_limit_type代替。文本检测的边长度限制类型。 -
                -
              • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
              • -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
              • -
              +已废弃,请参考text_det_limit_type,且与新的参数不能同时指定。 str -None + det_db_thresh -已废弃,请使用text_det_thresh代替。文本检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
              +已废弃,请参考text_det_thresh,且与新的参数不能同时指定。 float -None + det_db_box_thresh -已废弃,请使用text_det_box_thresh代替。文本检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
              +已废弃,请参考text_det_box_thresh,且与新的参数不能同时指定。 float -None + det_db_unclip_ratio -已废弃,请使用text_det_unclip_ratio代替。文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
                -
              • float:大于 0 的任意浮点数 -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
              +已废弃,请参考text_det_unclip_ratio,且与新的参数不能同时指定。 float -None + rec_model_dir -已废弃,请使用text_recognition_model_dir代替。文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +已废弃,请参考text_recognition_model_dir,且与新的参数不能同时指定。 str -None + rec_batch_num -已废弃,请使用text_recognition_batch_size代替。文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +已废弃,请参考text_recognition_batch_size,且与新的参数不能同时指定。 int -None + use_angle_cls -已废弃,请使用use_textline_orientation代替。是否使用文本行方向功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +已废弃,请参考use_textline_orientation,且与新的参数不能同时指定。 bool -None + cls_model_dir -已废弃,请使用text_line_orientation_model_dir代替。文本行方向模型的目录路径。如果设置为None, 将会下载官方模型。 +已废弃,请参考textline_orientation_model_dir,且与新的参数不能同时指定。 str -None + cls_batch_num -已废弃,请使用text_line_orientation_batch_size代替。文本行方向模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +已废弃,请参考textline_orientation_batch_size,且与新的参数不能同时指定。 int -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
              • CPU:如 cpu 表示使用 CPU 进行推理;
              • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
              • @@ -848,11 +837,10 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv5
              • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
              • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
              • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
              • -
              • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
              • -
              +
            如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -880,10 +868,10 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv5 enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -895,7 +883,7 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv5 paddlex_config PaddleX产线配置文件路径。 str -None + @@ -929,6 +917,125 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv5 若指定了`save_path`,则会保存可视化结果在`save_path`下。可视化结果如下: +
+Supported languages
+
+ocr_versionLanguages
+PP-OCRv5PP-OCRv5 supports the following languages: • ch: Simplified Chinese; • chinese_cht: Traditional Chinese; • en: English; • japan: Japanese; • korean: Korean; • te: Telugu; • ka: Kannada; • ta: Tamil.
+PP-OCRv4PP-OCRv4 supports the following languages: • ch: Simplified Chinese; • en: English.
+PP-OCRv3PP-OCRv3 supports the following languages:
+Language list
+• af: Afrikaans; az: Azerbaijani; bs: Bosnian; cs: Czech; cy: Welsh; da: Danish; de: German; es: Spanish; et: Estonian; fr: French;
+• ga: Irish; hr: Croatian; hu: Hungarian; id: Indonesian; is: Icelandic; it: Italian; ku: Kurdish; la: Latin; lt: Lithuanian; lv: Latvian;
+• mi: Maori; ms: Malay; mt: Maltese; nl: Dutch; no: Norwegian; oc: Occitan; pi: Pali; pl: Polish; pt: Portuguese; ro: Romanian;
+• rs_latin: Serbian (Latin); sk: Slovak; sl: Slovenian; sq: Albanian; sv: Swedish; sw: Swahili; tl: Tagalog; tr: Turkish; uz: Uzbek; vi: Vietnamese;
+• french: French; german: German; ar: Arabic; fa: Persian; ug: Uyghur; ur: Urdu; ru: Russian; rs_cyrillic: Serbian (Cyrillic); be: Belarusian; bg: Bulgarian;
+• uk: Ukrainian; mn: Mongolian; abq: Abaza; ady: Adyghe; kbd: Kabardian; ava: Avar; dar: Dargwa; inh: Ingush; che: Chechen; lbe: Lak;
+• lez: Lezgian; tab: Tabasaran; hi: Hindi; mr: Marathi; ne: Nepali; bh: Bihari; mai: Maithili; ang: Angika; bho: Bhojpuri; mah: Magahi;
+• sck: Nagpuri; new: Newari; gom: Goan Konkani; sa: Sanskrit; bgc: Haryanvi.
            +
            +
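A minimal sketch of pairing lang with a compatible ocr_version according to the table above:

```python
from paddleocr import PaddleOCR

# japan is supported by the default PP-OCRv5 models.
ocr_ja = PaddleOCR(lang="japan")

# fr appears only in the PP-OCRv3 list, so pin the version explicitly.
ocr_fr = PaddleOCR(lang="fr", ocr_version="PP-OCRv3")
```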
            ### 2.2 Python脚本方式集成 @@ -945,6 +1052,13 @@ ocr = PaddleOCR( # ocr = PaddleOCR(lang="en") # 通过 lang 参数来使用英文模型 # ocr = PaddleOCR(ocr_version="PP-OCRv4") # 通过 ocr_version 参数来使用 PP-OCR 其他版本 # ocr = PaddleOCR(device="gpu") # 通过 device 参数使得在模型推理时使用 GPU +# ocr = PaddleOCR( +# text_detection_model_name="PP-OCRv5_server_det", +# text_recognition_model_name="PP-OCRv5_server_rec", +# use_doc_orientation_classify=False, +# use_doc_unwarping=False, +# use_textline_orientation=False, +# ) # 更换 PP-OCRv5_server 模型 result = ocr.predict("./general_ocr_002.png") for res in result: res.print() @@ -968,101 +1082,101 @@ for res in result: doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果设置为None,将会下载官方模型。 str None text_detection_model_name -文本检测模型的名称。如果设置为None, 将会使用产线默认模型。 +文本检测模型的名称。如果设置为None,将会使用产线默认模型。 str None text_detection_model_dir -文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +文本检测模型的目录路径。如果设置为None,将会下载官方模型。 str None -text_line_orientation_model_name -文本行方向模型的名称。如果设置为None, 将会使用产线默认模型。 +textline_orientation_model_name +文本行方向模型的名称。如果设置为None,将会使用产线默认模型。 str None -text_line_orientation_model_dir -文本行方向模型的目录路径。如果设置为None, 将会下载官方模型。 +textline_orientation_model_dir +文本行方向模型的目录路径。如果设置为None,将会下载官方模型。 str None -text_line_orientation_batch_size -文本行方向模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +textline_orientation_batch_size +文本行方向模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None text_recognition_model_name -文本识别模型的名称。如果设置为None, 将会使用产线默认模型。 +文本识别模型的名称。如果设置为None,将会使用产线默认模型。 str None text_recognition_model_dir -文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +文本识别模型的目录路径。如果设置为None,将会下载官方模型。 str None text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None use_doc_orientation_classify -是否使用文档方向分类功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否使用文本图像矫正功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_textline_orientation -是否使用文本行方向功能。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本行方向模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None text_det_limit_side_len -文本检测的最大边长度限制。 +文本检测的图像边长限制。
            • int:大于 0 的任意整数;
            • -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
            • -
            +
          • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 64
          • + int None @@ -1071,9 +1185,9 @@ for res in result: text_det_limit_type 文本检测的边长度限制类型。
              -
            • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
            • -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
            • -
            +
          • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
          • +
          • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 min
          • + str None @@ -1082,8 +1196,8 @@ for res in result: text_det_thresh 文本检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
              -
            • float:大于 0 的任意浮点数 -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
            +
          • float:大于0的任意浮点数; +
          • None:如果设置为None,将默认使用产线初始化的该参数值 0.3
          • float None @@ -1092,8 +1206,8 @@ for res in result: text_det_box_thresh 文本检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
              -
            • float:大于 0 的任意浮点数 -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
            +
          • float:大于0的任意浮点数; +
          • None:如果设置为None,将默认使用产线初始化的该参数值 0.6float None @@ -1102,8 +1216,9 @@ for res in result: text_det_unclip_ratio 文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
              -
            • float:大于 0 的任意浮点数 -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
            +
          • float:大于0的任意浮点数; +
          • None:如果设置为None,将默认使用产线初始化的该参数值 2.0。 + float None @@ -1118,8 +1233,9 @@ for res in result: text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。
              -
            • float:大于 0 的任意浮点数 -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
            +
          • float:大于0的任意浮点数; +
          • None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。 + float None @@ -1132,38 +1248,26 @@ for res in result: lang -使用指定语言的 OCR 模型。 -
              -
            • ch:中文; -
            • en:英文; -
            • korean:韩文; -
            • japan:日文; -
            • chinese_cht:繁体中文; -
            • te:泰卢固文; -
            • ka:卡纳达文; -
            • ta:泰米尔文; -
            • None:如果设置为 None, 将默认使用ch
            • -
            +使用指定语言的 OCR 模型。请查看上方的详细列表。 str None ocr_version -OCR 版本。 +OCR 版本,注意不是每个ocr_version都支持所有的lang
              -
            • PP-OCRv5:使用PP-OCRv5系列模型; -
            • PP-OCRv4:使用PP-OCRv4系列模型; -
            • PP-OCRv3:使用PP-OCRv3系列模型; -
            • None:如果设置为 None, 将默认使用PP-OCRv5系列模型;
            • -
            +
          • PP-OCRv5:使用PP-OCRv5系列模型; +
          • PP-OCRv4:使用PP-OCRv4系列模型; +
          • PP-OCRv3:使用PP-OCRv3系列模型; + str None device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
            • CPU:如 cpu 表示使用 CPU 进行推理;
            • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
            • @@ -1171,8 +1275,8 @@ for res in result:
            • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
            • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
            • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
            • -
            • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
            • -
            +
          • None:如果设置为None,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 + str None @@ -1199,14 +1303,13 @@ for res in result: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 - +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1239,21 +1342,15 @@ for res in result: input 待预测数据,支持多种输入类型,必填。
              -
            • Python Var:如 numpy.ndarray 表示的图像数据
            • -
            • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
            • -
            • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
            • +
            • Python Var:如 numpy.ndarray 表示的图像数据;
            • +
            • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径);
            • +
            • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]。
            Python Var|str|list -device -与实例化时的参数相同。 -str -None - - use_doc_orientation_classify 是否在推理时使用文档方向分类模块。 bool @@ -1320,19 +1417,19 @@ for res in result: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1340,19 +1437,19 @@ for res in result: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1360,58 +1457,59 @@ for res in result: 将结果保存为图像格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 -- 调用`print()` 方法会将结果打印到终端,打印到终端的内容解释如下: - - - `input_path`: `(str)` 待预测图像的输入路径 - - - `page_index`: `(Union[int, None])` 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 `None` - - - `model_settings`: `(Dict[str, bool])` 配置产线所需的模型参数 - - - `use_doc_preprocessor`: `(bool)` 控制是否启用文档预处理子产线 - - `use_textline_orientation`: `(bool)` 控制是否启用文本行方向分类功能 - - - `doc_preprocessor_res`: `(Dict[str, Union[str, Dict[str, bool], int]])` 文档预处理子产线的输出结果。仅当`use_doc_preprocessor=True`时存在 - - `input_path`: `(Union[str, None])` 图像预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None` - - `model_settings`: `(Dict)` 预处理子产线的模型配置参数 - - `use_doc_orientation_classify`: `(bool)` 控制是否启用文档方向分类 - - `use_doc_unwarping`: `(bool)` 控制是否启用文本图像矫正 - - `angle`: `(int)` 文档方向分类的预测结果。启用时取值为[0,1,2,3],分别对应[0°,90°,180°,270°];未启用时为-1 - - - `dt_polys`: `(List[numpy.ndarray])` 文本检测的多边形框列表。每个检测框由4个顶点坐标构成的numpy数组表示,数组shape为(4, 2),数据类型为int16 - - - `dt_scores`: `(List[float])` 文本检测框的置信度列表 - - - `text_det_params`: `(Dict[str, Dict[str, int, float]])` 文本检测模块的配置参数 - - `limit_side_len`: `(int)` 图像预处理时的边长限制值 - - `limit_type`: `(str)` 边长限制的处理方式 - - `thresh`: `(float)` 文本像素分类的置信度阈值 - - `box_thresh`: `(float)` 文本检测框的置信度阈值 - - `unclip_ratio`: `(float)` 文本检测框的膨胀系数 - - `text_type`: `(str)` 文本检测的类型,当前固定为"general" - - - `textline_orientation_angles`: `(List[int])` 文本行方向分类的预测结果。启用时返回实际角度值(如[0,0,1]),未启用时返回[-1,-1,-1] - - - `text_rec_score_thresh`: `(float)` 文本识别结果的过滤阈值 - - - `rec_texts`: `(List[str])` 文本识别结果列表,仅包含置信度超过`text_rec_score_thresh`的文本 - - - `rec_scores`: `(List[float])` 文本识别的置信度列表,已按`text_rec_score_thresh`过滤 - - - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的文本检测框列表,格式同`dt_polys` - - - `rec_boxes`: `(numpy.ndarray)` 检测框的矩形边界框数组,shape为(n, 4),dtype为int16。每一行表示一个矩形框的[x_min, y_min, x_max, y_max]坐标 - ,其中(x_min, y_min)为左上角坐标,(x_max, y_max)为右下角坐标 - -- 调用`save_to_json()` 方法会将上述内容保存到指定的`save_path`中,如果指定为目录,则保存的路径为`save_path/{your_img_basename}_res.json`,如果指定为文件,则直接保存到该文件中。由于json文件不支持保存numpy数组,因此会将其中的`numpy.array`类型转换为列表形式。 -- 调用`save_to_img()` 方法会将可视化结果保存到指定的`save_path`中,如果指定为目录,则保存的路径为`save_path/{your_img_basename}_ocr_res_img.{your_img_extension}`,如果指定为文件,则直接保存到该文件中。(产线通常包含较多结果图片,不建议直接指定为具体的文件路径,否则多张图会被覆盖,仅保留最后一张图) +
- Calling the `print()` method prints the result to the terminal. The printed content is explained as follows:
  - `input_path`: `(str)` Input path of the image to be predicted
  - `page_index`: `(Union[int, None])` If the input is a PDF file, this is the current page number within the PDF; otherwise `None`
  - `model_settings`: `(Dict[str, bool])` Model parameters configured for the pipeline
    - `use_doc_preprocessor`: `(bool)` Whether the document preprocessing sub-pipeline is enabled
    - `use_textline_orientation`: `(bool)` Whether the text line orientation classification module is enabled
  - `doc_preprocessor_res`: `(Dict[str, Union[str, Dict[str, bool], int]])` Output of the document preprocessing sub-pipeline; present only when `use_doc_preprocessor=True`
    - `input_path`: `(Union[str, None])` Image path accepted by the image preprocessing sub-pipeline; saved as `None` when the input is a `numpy.ndarray`
    - `model_settings`: `(Dict)` Model configuration parameters of the preprocessing sub-pipeline
      - `use_doc_orientation_classify`: `(bool)` Whether document orientation classification is enabled
      - `use_doc_unwarping`: `(bool)` Whether text image unwarping is enabled
    - `angle`: `(int)` Prediction result of document orientation classification; when enabled, one of [0,1,2,3], corresponding to [0°,90°,180°,270°]; -1 when disabled
  - `dt_polys`: `(List[numpy.ndarray])` List of text detection polygons; each box is represented by a numpy array of 4 vertex coordinates with shape (4, 2) and dtype int16
  - `dt_scores`: `(List[float])` Confidence scores of the text detection boxes
  - `text_det_params`: `(Dict[str, Dict[str, int, float]])` Configuration parameters of the text detection module
    - `limit_side_len`: `(int)` Side-length limit applied during image preprocessing
    - `limit_type`: `(str)` How the side-length limit is applied
    - `thresh`: `(float)` Confidence threshold for classifying text pixels
    - `box_thresh`: `(float)` Confidence threshold for text detection boxes
    - `unclip_ratio`: `(float)` Expansion ratio of text detection boxes
    - `text_type`: `(str)` Type of text detection, currently fixed to "general"
  - `textline_orientation_angles`: `(List[int])` Prediction results of text line orientation classification; actual angle values when enabled (e.g., [0,0,1]), [-1,-1,-1] when disabled
  - `text_rec_score_thresh`: `(float)` Filtering threshold for text recognition results
  - `rec_texts`: `(List[str])` Text recognition results, containing only texts whose confidence exceeds `text_rec_score_thresh`
  - `rec_scores`: `(List[float])` Confidence scores of text recognition, filtered by `text_rec_score_thresh`
  - `rec_polys`: `(List[numpy.ndarray])` Confidence-filtered text detection boxes, in the same format as `dt_polys`
  - `rec_boxes`: `(numpy.ndarray)` Rectangular bounding boxes of the detections, with shape (n, 4) and dtype int16; each row is [x_min, y_min, x_max, y_max], where (x_min, y_min) is the top-left corner and (x_max, y_max) the bottom-right corner
- Calling the `save_to_json()` method saves the above content to the specified `save_path`. If a directory is given, the file is saved as `save_path/{your_img_basename}_res.json`; if a file path is given, it is saved directly to that file. Since JSON files cannot store numpy arrays, `numpy.array` values are converted to lists.
- Calling the `save_to_img()` method saves the visualization results to the specified `save_path`. If a directory is given, the file is saved as `save_path/{your_img_basename}_ocr_res_img.{your_img_extension}`; if a file path is given, it is saved directly to that file. (Pipelines usually produce many result images, so specifying a concrete file path directly is not recommended; otherwise multiple images will overwrite each other and only the last one will be kept.)
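As a concrete illustration, a minimal sketch of reading back a file written by `save_to_json()`; the file name below is hypothetical and depends on your input image:

```python
import json

# Hypothetical path: a directory "output" passed to save_to_json() yields
# files named {your_img_basename}_res.json.
with open("output/general_ocr_002_res.json", encoding="utf-8") as f:
    data = json.load(f)

# rec_texts, rec_scores and rec_boxes are parallel, confidence-filtered lists;
# numpy arrays were converted to plain lists during serialization.
for text, score, box in zip(data["rec_texts"], data["rec_scores"], data["rec_boxes"]):
    x_min, y_min, x_max, y_max = box
    print(f"{text!r} score={score:.3f} box=({x_min}, {y_min}, {x_max}, {y_max})")
```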

In addition, the visualized images and the prediction results can also be obtained through attributes, as follows:

- The prediction result obtained through the `json` attribute is a dict, and its content is identical to what the `save_to_json()` method saves.
- The prediction result returned by the `img` attribute is a dict whose keys are `ocr_res_img` and `preprocessed_img`, each mapping to an `Image.Image` object: one visualizes the OCR results, the other visualizes the image preprocessing. If the image preprocessing submodule is not used, the dict contains only `ocr_res_img`.
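A short sketch of consuming these attributes, assuming `res` is one result object produced by the OCR pipeline:

```python
# `json` mirrors what save_to_json() writes.
data = res.json

# `img` maps names to PIL.Image.Image objects.
images = res.img
images["ocr_res_img"].save("ocr_res.png")
# Present only when the image preprocessing submodule was used.
if "preprocessed_img" in images:
    images["preprocessed_img"].save("preprocessed.png")
```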
### 4.2 Model Application

After completing fine-tuning with your private dataset, you obtain local model weight files. You can then use the fine-tuned weights either by specifying the local model save paths through parameters or through a custom pipeline configuration file.

#### 4.2.1 Specifying the local model path through parameters

When initializing the pipeline object, specify the local model path through parameters. Taking the fine-tuned weights of the text detection model as an example:

Command line:

```bash
# Specify the local model path via --text_detection_model_dir
paddleocr ocr -i ./general_ocr_002.png --text_detection_model_dir your_det_model_path

# PP-OCRv5_server_det is used as the default text detection model. If you fine-tuned a different model, change the model name via --text_detection_model_name
paddleocr ocr -i ./general_ocr_002.png --text_detection_model_name PP-OCRv5_mobile_det --text_detection_model_dir your_v5_mobile_det_model_path
```

Python script:

```python
from paddleocr import PaddleOCR

# Specify the local model path via text_detection_model_dir
pipeline = PaddleOCR(text_detection_model_dir="./your_det_model_path")

# PP-OCRv5_server_det is used as the default text detection model. If you fine-tuned a different model, change the model name via text_detection_model_name
# pipeline = PaddleOCR(text_detection_model_name="PP-OCRv5_mobile_det", text_detection_model_dir="./your_v5_mobile_det_model_path")
```

#### 4.2.2 Specifying the local model path through a configuration file

1. Obtain the pipeline configuration file

Call the `export_paddlex_config_to_yaml` method of the general OCR pipeline object in PaddleOCR to export the current pipeline configuration as a YAML file:

```python
from paddleocr import PaddleOCR

pipeline = PaddleOCR()
pipeline.export_paddlex_config_to_yaml("PaddleOCR.yaml")
```

2. Modify the configuration file

After obtaining the default pipeline configuration file, replace the corresponding entries with the local paths of your fine-tuned model weights. For example:

```yaml
SubModules:
  TextDetection:
    box_thresh: 0.6
    limit_side_len: 64
    limit_type: min
    max_side_limit: 4000
    model_dir: null # Replace with the path to the fine-tuned text detection model weights
    model_name: PP-OCRv5_server_det # If the fine-tuned model name differs from the default, change it here as well
    module_name: text_detection
    thresh: 0.3
    unclip_ratio: 1.5
  TextLineOrientation:
    batch_size: 6
    model_dir: null # Replace with the path to the fine-tuned text line orientation classification model weights
    model_name: PP-LCNet_x1_0_textline_ori # If the fine-tuned model name differs from the default, change it here as well
    module_name: textline_orientation
  TextRecognition:
    batch_size: 6
    model_dir: null # Replace with the path to the fine-tuned text recognition model weights
    model_name: PP-OCRv5_server_rec # If the fine-tuned model name differs from the default, change it here as well
    module_name: text_recognition
    score_thresh: 0.0
......
```

The pipeline configuration file contains not only the parameters supported by the PaddleOCR CLI and Python API but also more advanced settings. The corresponding pipeline usage tutorials in the [PaddleX Pipeline Usage Overview](https://paddlepaddle.github.io/PaddleX/3.0/pipeline_usage/pipeline_develop_guide.html) explain these in detail; adjust the settings as needed.

3. Load the pipeline configuration file in the CLI

After modifying the configuration file, specify its path with the `--paddlex_config` command-line parameter; PaddleOCR reads its content as the pipeline configuration. Example:

```bash
paddleocr ocr --paddlex_config PaddleOCR.yaml ...
```

4. Load the pipeline configuration file in the Python API

When initializing the pipeline object, pass the path of a PaddleX pipeline configuration file or a configuration dict via the `paddlex_config` parameter; PaddleOCR reads its content as the pipeline configuration. Example:

```python
from paddleocr import PaddleOCR

pipeline = PaddleOCR(paddlex_config="PaddleOCR.yaml")
```

diff --git a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.en.md b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.en.md

---
comments: true
---

# PP-ChatOCRv4-doc Pipeline Usage Tutorial
## 1. Introduction to PP-ChatOCRv4-doc Pipeline

PP-ChatOCRv4-doc is a unique document and image intelligent analysis solution from PaddlePaddle, combining LLM, MLLM, and OCR technologies to address complex document information extraction challenges such as layout analysis, rare characters, multi-page PDFs, tables, and seal recognition. Integrated with ERNIE Bot, it fuses massive data and knowledge, achieving high accuracy and wide applicability. This pipeline also provides flexible service deployment options, supporting deployment on various hardware. Furthermore, it offers custom development capabilities, allowing you to train and fine-tune models on your own datasets, with seamless integration of trained models.
- Performance Test Environment
  - Test Dataset:
    - Text Image Rectification Model: DocUNet
    - Layout Region Detection Model: A self-built layout analysis dataset using PaddleOCR, containing 10,000 images of common document types such as Chinese and English papers, magazines, and research reports.
    - Seal Text Detection Model: A self-built dataset using PaddleOCR, containing 500 images of circular seal textures.
  - Hardware Configuration:
    - GPU: NVIDIA Tesla T4
    - CPU: Intel Xeon Gold 6271C @ 2.60GHz
Before using the PP-ChatOCRv4-doc pipeline locally, ensure you have completed the installation of the PaddleOCR wheel package according to the [PaddleOCR Local Installation Tutorial](../installation.en.md). If you wish to selectively install dependencies, please refer to the relevant instructions in the installation guide. The dependency group corresponding to this pipeline is `ie`.

Before performing model inference, you first need to prepare the API key for the large language model. PP-ChatOCRv4 supports large model services on the [Baidu Cloud Qianfan Platform](https://console.bce.baidu.com/qianfan/ais/console/onlineService) or a locally deployed service that exposes the standard OpenAI interface. If using the Baidu Cloud Qianfan Platform, refer to [Authentication and Authorization](https://cloud.baidu.com/doc/qianfan-api/s/ym9chdsy5) to obtain the API key. If using a locally deployed large model service, refer to the [PaddleNLP Large Model Deployment Documentation](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm) to deploy the dialogue and vectorization interfaces, and fill in the corresponding `base_url` and `api_key`. If you need to use a multimodal large model for data fusion, refer to the OpenAI service deployment in the [PaddleMIX Model Documentation](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2) for multimodal large model deployment, and fill in the corresponding `base_url` and `api_key`.

**Note**: If local deployment of a multimodal large model is restricted by the local environment, you can comment out the lines containing the `mllm` variable in the code and use only the large language model for information extraction.

```bash
paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数
```
The command line supports more parameter configurations. Click to expand for a detailed explanation of the command line parameters.

| Parameter | Description | Type | Default |
|---|---|---|---|
| `input` | Data to be predicted, required. Such as the local path of an image or PDF file: `/root/data/img.jpg`; a URL link, such as the network URL of an image or PDF file (example); or a local directory, which should contain images to be predicted, such as `/root/data/` (prediction of PDF files in directories is currently not supported; a PDF file needs to be specified by its full path). | `str` | |
| `keys` | Keys for information extraction. | `str` | |
| `save_path` | Specify the path to save the inference results file. If not set, the inference results will not be saved locally. | `str` | |
| `invoke_mllm` | Whether to load and use a multimodal large model. If not set, the default is `False`. | `bool` | |
| `layout_detection_model_name` | The name of the layout detection model. If not set, the default model in the pipeline will be used. | `str` | |
| `layout_detection_model_dir` | The directory path of the layout detection model. If not set, the official model will be downloaded. | `str` | |
| `doc_orientation_classify_model_name` | The name of the document orientation classification model. If not set, the default model in the pipeline will be used. | `str` | |
| `doc_orientation_classify_model_dir` | The directory path of the document orientation classification model. If not set, the official model will be downloaded. | `str` | |
| `doc_unwarping_model_name` | The name of the text image unwarping model. If not set, the default model in the pipeline will be used. | `str` | |
| `doc_unwarping_model_dir` | The directory path of the text image unwarping model. If not set, the official model will be downloaded. | `str` | |
| `text_detection_model_name` | Name of the text detection model. If not set, the pipeline's default model will be used. | `str` | |
| `text_detection_model_dir` | Directory path of the text detection model. If not set, the official model will be downloaded. | `str` | |
| `text_recognition_model_name` | Name of the text recognition model. If not set, the pipeline's default model will be used. | `str` | |
| `text_recognition_model_dir` | Directory path of the text recognition model. If not set, the official model will be downloaded. | `str` | |
| `text_recognition_batch_size` | Batch size for the text recognition model. If not set, the default batch size will be 1. | `int` | |
| `table_structure_recognition_model_name` | Name of the table structure recognition model. If not set, the pipeline's default model will be used. | `str` | |
| `table_structure_recognition_model_dir` | Directory path of the table structure recognition model. If not set, the official model will be downloaded. | `str` | |
| `seal_text_detection_model_name` | The name of the seal text detection model. If not set, the pipeline's default model will be used. | `str` | |
| `seal_text_detection_model_dir` | The directory path of the seal text detection model. If not set, the official model will be downloaded. | `str` | |
| `seal_text_recognition_model_name` | The name of the seal text recognition model. If not set, the default model of the pipeline will be used. | `str` | |
| `seal_text_recognition_model_dir` | The directory path of the seal text recognition model. If not set, the official model will be downloaded. | `str` | |
| `seal_text_recognition_batch_size` | The batch size for the seal text recognition model. If not set, the batch size will default to 1. | `int` | |
| `use_doc_orientation_classify` | Whether to load and use the document orientation classification module. If not set, the parameter value initialized by the pipeline will be used by default, initialized as `True`. | `bool` | |
| `use_doc_unwarping` | Whether to load and use the text image unwarping module. If not set, the parameter value initialized by the pipeline will be used by default, initialized as `True`. | `bool` | |
| `use_textline_orientation` | Whether to load and use the text line orientation classification module. If not set, the parameter value initialized by the pipeline will be used by default, initialized as `True`. | `bool` | |
| `use_seal_recognition` | Whether to load and use the seal recognition sub-pipeline. If not set, the parameter's value initialized during pipeline setup will be used, defaulting to `True`. | `bool` | |
| `use_table_recognition` | Whether to load and use the table recognition sub-pipeline. If not set, the parameter's value initialized during pipeline setup will be used, defaulting to `True`. | `bool` | |
| `layout_threshold` | Score threshold for the layout model. Any value between 0-1. If not set, the default value is used, which is 0.5. | `float` | |
| `layout_nms` | Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If not set, the parameter will default to the value initialized in the pipeline, which is set to `True` by default. | `bool` | |
| `layout_unclip_ratio` | Unclip ratio for detected boxes in the layout detection model. Any float > 0. If not set, the default is 1.0. | `float` | |
| `layout_merge_bboxes_mode` | The merging mode for the detection boxes output by the model in layout region detection: `large` retains only the largest outer bounding box among overlapping boxes and removes the inner ones; `small` retains only the smallest inner boxes and removes the outer ones; `union` performs no filtering and retains both inner and outer boxes. If not set, the default is `large`. | `str` | |
| `text_det_limit_side_len` | Image side length limitation for text detection. Any integer greater than 0. If not set, the pipeline's initialized value for this parameter (initialized to 960) will be used. | `int` | |
| `text_det_limit_type` | Type of side length limit for text detection. Supports `min` and `max`: `min` means ensuring the shortest side of the image is not smaller than `det_limit_side_len`, and `max` means ensuring the longest side of the image is not larger than `limit_side_len`. If not set, the pipeline's initialized value for this parameter (initialized to `max`) will be used. | `str` | |
| `text_det_thresh` | Pixel threshold for text detection. In the output probability map, pixels with scores higher than this threshold will be considered text pixels. Any float greater than 0. If not set, the pipeline's initialized value for this parameter (0.3) will be used. | `float` | |
| `text_det_box_thresh` | Text detection box threshold. If the average score of all pixels within the detected result boundary is higher than this threshold, the result will be considered a text region. Any float greater than 0. If not set, the pipeline's initialized value for this parameter (0.6) will be used. | `float` | |
| `text_det_unclip_ratio` | Text detection expansion coefficient. This method is used to expand the text region; the larger the value, the larger the expanded area. Any float greater than 0. If not set, the pipeline's initialized value for this parameter (2.0) will be used. | `float` | |
| `text_rec_score_thresh` | Text recognition threshold. Text results with scores higher than this threshold will be retained. Any float greater than 0. If not set, the pipeline's initialized value for this parameter (0.0, i.e., no threshold) will be used. | `float` | |
| `seal_det_limit_side_len` | Image side length limit for seal text detection. Any integer > 0. If not set, the default is 736. | `int` | |
| `seal_det_limit_type` | Limit type for the image side in seal text detection. Supports `min` and `max`: `min` ensures the shortest side ≥ `det_limit_side_len`, `max` ensures the longest side ≤ `limit_side_len`. If not set, the default is `min`. | `str` | |
| `seal_det_thresh` | Pixel threshold. Pixels with scores above this value in the probability map are considered text. Any float > 0. If not set, the default is 0.2. | `float` | |
| `seal_det_box_thresh` | Box threshold. Boxes with average pixel scores above this value are considered text regions. Any float > 0. If not set, the default is 0.6. | `float` | |
| `seal_det_unclip_ratio` | Expansion ratio for seal text detection. A higher value means a larger expansion area. Any float > 0. If not set, the default is 0.5. | `float` | |
| `seal_rec_score_thresh` | Recognition score threshold. Text results above this value will be kept. Any float > 0. If not set, the default is 0.0 (no threshold). | `float` | |
| `qianfan_api_key` | API key for the Qianfan Platform. | `str` | |
| `pp_docbee_base_url` | Configuration for the multimodal large language model. | `str` | |
| `device` | The device used for inference. You can specify a particular card number: CPU (e.g., `cpu`); GPU (e.g., `gpu:0` for the 1st GPU); NPU (e.g., `npu:0`); XPU (e.g., `xpu:0`); MLU (e.g., `mlu:0`); DCU (e.g., `dcu:0`). If not set, the pipeline-initialized value for this parameter will be used; during initialization, the local GPU device 0 will be preferred, and if unavailable, the CPU device will be used. | `str` | |
| `enable_hpi` | Whether to enable the high-performance inference plugin. | `bool` | `False` |
| `use_tensorrt` | Whether to use TensorRT for inference acceleration. | `bool` | `False` |
| `min_subgraph_size` | Minimum subgraph size for optimizing the computation of model subgraphs. | `int` | `3` |
| `precision` | Compute precision, such as FP32 or FP16. | `str` | `fp32` |
| `enable_mkldnn` | Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. | `bool` | `True` |
| `cpu_threads` | The number of threads to use when performing inference on the CPU. | `int` | `8` |
| `paddlex_config` | Path to the PaddleX pipeline configuration file. | `str` | |
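Putting several of these options together, a hypothetical end-to-end invocation might look like the following (the paths and API key are placeholders; all flags are taken from the table above):

```bash
# Hedged sketch: extract one key from a local image, skipping the optional
# document preprocessing modules and saving results to ./output.
paddleocr pp_chatocrv4_doc \
    -i vehicle_certificate-1.png \
    -k 驾驶室准乘人数 \
    --qianfan_api_key your_api_key \
    --use_doc_orientation_classify False \
    --use_doc_unwarping False \
    --save_path ./output
```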
For the Python API, the relevant parameter descriptions are as follows:

- `use_doc_orientation_classify` (`bool`, default `None`): Whether to load and use the document orientation classification module. If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `True`).
- `use_doc_unwarping` (`bool`, default `None`): Whether to load and use the document unwarping module. If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `True`).
- `use_textline_orientation` (`bool`, default `None`): Whether to load and use the text line orientation classification function. If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `True`).
- `use_seal_recognition` (`bool`, default `None`): Whether to load and use the seal recognition sub-pipeline. If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `True`).
- `use_table_recognition` (`bool`, default `None`): Whether to load and use the table recognition sub-pipeline. If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `True`).
- `layout_threshold` (`float|dict`, default `None`): Layout model score threshold:
  - `float`: Any float between 0-1;
  - `dict`: `{0:0.1}` where the key is the class ID and the value is the threshold for that class;
  - `None`: If set to `None`, uses the pipeline default of 0.5.
- `layout_nms` (`bool`, default `None`): Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If set to `None`, the parameter will default to the value initialized in the pipeline, which is set to `True` by default.
- `layout_unclip_ratio` (`float|Tuple[float,float]|dict`, default `None`): Expansion factor for the detection boxes of the layout region detection model:
  - `float`: Any float greater than 0;
  - `Tuple[float,float]`: Expansion ratios in horizontal and vertical directions;
  - `dict`: A dictionary with `int` keys representing `cls_id`, and `tuple` values, e.g., `{0: (1.1, 2.0)}` means width is expanded 1.1× and height 2.0× for class 0 boxes;
  - `None`: If set to `None`, uses the pipeline default of 1.0.
- `layout_merge_bboxes_mode` (`str|dict`, default `None`): Method for filtering overlapping boxes in layout region detection:
  - `str`: `large`, `small`, or `union`, representing whether to keep the large box, the small box, or both when filtering overlapping boxes;
  - `dict`, where the key is of `int` type, representing `cls_id`, and the value is of `str` type, e.g., `{0: "large", 2: "small"}`, meaning use "large" mode for class 0 detection boxes and "small" mode for class 2 detection boxes;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `large`).
- `text_det_limit_side_len` (`int`, default `None`): Image side length limitation for text detection:
  - `int`: Any integer greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to 960).
- `text_det_limit_type` (`str`, default `None`): Type of side length limit for text detection:
  - `str`: Supports `min` and `max`. `min` ensures the shortest side of the image is not less than `det_limit_side_len`; `max` ensures the longest side of the image is not greater than `limit_side_len`;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `max`).
- `text_det_thresh` (`float`, default `None`): Detection pixel threshold. In the output probability map, pixels with scores greater than this threshold are considered text pixels:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.3) will be used by default.
- `text_det_box_thresh` (`float`, default `None`): Detection box threshold. If the average score of all pixels within a detection result's bounding box is greater than this threshold, the result is considered a text region:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.6) will be used by default.
- `text_det_unclip_ratio` (`float`, default `None`): Text detection expansion factor. This method is used to expand text regions; the larger the value, the larger the expanded area:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (2.0) will be used by default.
- `text_rec_score_thresh` (`float`, default `None`): Text recognition threshold. Text results with scores greater than this threshold will be kept:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.0, i.e., no threshold) will be used by default.
- `seal_det_limit_side_len` (`int`, default `None`): Image side length limit for seal text detection:
  - `int`: Any integer greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to 736).
- `seal_det_limit_type` (`str`, default `None`): Type of image side length limit for seal text detection:
  - `str`: Supports `min` and `max`. `min` ensures the shortest side of the image is not less than `det_limit_side_len`; `max` ensures the longest side of the image is not greater than `limit_side_len`;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter will be used by default (initialized to `min`).
- `seal_det_thresh` (`float`, default `None`): Detection pixel threshold. In the output probability map, pixels with scores greater than this threshold are considered text pixels:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.2) will be used by default.
- `seal_det_box_thresh` (`float`, default `None`): Detection box threshold. If the average score of all pixels within a detection result's bounding box is greater than this threshold, the result is considered a text region:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.6) will be used by default.
- `seal_det_unclip_ratio` (`float`, default `None`): Seal text detection expansion factor. This method is used to expand text regions; the larger the value, the larger the expanded area:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.5) will be used by default.
- `seal_rec_score_thresh` (`float`, default `None`): Seal text recognition threshold. Text results with scores greater than this threshold will be kept:
  - `float`: Any float greater than 0;
  - `None`: If set to `None`, the value initialized by the pipeline for this parameter (0.0, i.e., no threshold) will be used by default.
- `device` (`str`, default `None`): Device used for inference. Supports specifying a specific card number:
  - CPU: e.g., `cpu` indicates using CPU for inference;
  - GPU: e.g., `gpu:0` indicates using the 1st GPU for inference;
  - NPU: e.g., `npu:0` indicates using the 1st NPU for inference;
  - XPU: e.g., `xpu:0` indicates using the 1st XPU for inference;
  - MLU: e.g., `mlu:0` indicates using the 1st MLU for inference;
  - DCU: e.g., `dcu:0` indicates using the 1st DCU for inference;
  - `None`: If set to `None`, the pipeline-initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
- `precision` (`str`, default `"fp32"`): Computation precision, e.g., `fp32`, `fp16`.
- `enable_mkldnn` (`bool`, default `True`): Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set.
- `cpu_threads` (`int`, default `8`): The number of threads to use when performing inference on the CPU.
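To make the parameter reference above concrete, here is a minimal initialization sketch. It assumes the `PPChatOCRv4Doc` class name exported by the `paddleocr` package (an assumption based on PaddleOCR 3.x naming, not shown in this excerpt) and uses only parameters documented above:

```python
from paddleocr import PPChatOCRv4Doc  # class name assumed, not shown in this excerpt

# Disable the optional preprocessing modules and keep the default thresholds,
# using only parameters documented in the reference above.
pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_seal_recognition=True,
    use_table_recognition=True,
    text_det_limit_side_len=960,
    text_det_limit_type="max",
    device="gpu:0",  # e.g. "cpu", "npu:0"; when unset, GPU 0 is preferred, else CPU
)
```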
The relevant parameter descriptions are as follows:

- `input` (`Python Var|str|list`, required): Data to be predicted, supports multiple input types:
  - `Python Var`: e.g., image data represented by `numpy.ndarray`;
  - `str`: e.g., the local path of an image file or PDF file: `/root/data/img.jpg`; a URL link, e.g., the network URL of an image file or PDF file; or a local directory, which must contain images to be predicted, e.g., `/root/data/` (currently, prediction from directories containing PDF files is not supported; PDF files need to be specified by their full path);
  - `List`: list elements must be of the above types, e.g., `[numpy.ndarray, numpy.ndarray]`, `["/root/data/img1.jpg", "/root/data/img2.jpg"]`, `["/root/data1", "/root/data2"]`.
- `use_doc_orientation_classify` (`bool`, default `None`): Whether to use the document orientation classification module during inference.
- `use_doc_unwarping` (`bool`, default `None`): Whether to use the document image unwarping module during inference.

The prediction results can be printed or saved through the following methods:

| Method | Description | Parameter | Type | Parameter description | Default |
|---|---|---|---|---|---|
| `print()` | Prints the result to the terminal | `format_json` | `bool` | Whether to format the output content using JSON indentation. | `True` |
| | | `indent` | `int` | Specifies the indentation level to beautify the output JSON data for better readability; effective only when `format_json` is `True`. | `4` |
| | | `ensure_ascii` | `bool` | Controls whether to escape non-ASCII characters to Unicode; `True` escapes all non-ASCII characters, `False` preserves the original characters. Effective only when `format_json` is `True`. | `False` |
| `save_to_json()` | Saves the result as a JSON file | `save_path` | `str` | Save file path; supports a directory or a file path. | `None` |
| | | `indent` | `int` | Same as for `print()`. | `4` |
| | | `ensure_ascii` | `bool` | Same as for `print()`. | `False` |
| `save_to_img()` | Saves the visualization images of the various intermediate modules as PNG images. | `save_path` | `str` | Save file path; supports a directory or a file path. | `None` |
| `save_to_html()` | Saves the tables in the file as HTML files. | `save_path` | `str` | Save file path; supports a directory or a file path. | `None` |
| `save_to_xlsx()` | Saves the tables in the file as XLSX files. | `save_path` | `str` | Save file path; supports a directory or a file path. | `None` |

For the `build_vector()` method, the returned dictionary contains visual text information with the following content:

- `flag_save_bytes_vector`: `(bool)` Whether to save the result as a binary file.
- `flag_too_short_text`: `(bool)` Whether the text length is less than the minimum number of characters.
- `vector`: `(str|list)` Binary content of the text or the text content itself, depending on the values of `flag_save_bytes_vector` and `min_characters`. If `flag_save_bytes_vector=True` and the text length is greater than or equal to the minimum number of characters, it returns binary content; otherwise, it returns the original text.
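As an illustration, a sketch of consuming the returned dictionary; the `visual_info_list` input and the `flag_save_bytes_vector` keyword follow the PP-ChatOCRv4 example flow and should be treated as assumptions here:

```python
# visual_info_list is assumed to have been collected from earlier
# visual prediction results.
vector_info = pipeline.build_vector(
    visual_info_list,
    flag_save_bytes_vector=True,  # assumed keyword mirroring the returned flag
)

if vector_info["flag_too_short_text"]:
    # Text shorter than min_characters stays as plain text rather than vectors.
    print("Original text kept:", vector_info["vector"])
```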
      -
    • Python Var:e.g., image data represented by numpy.ndarray
    • -
    • str:e.g., local path of an image file or single-page PDF file: /root/data/img.jpgURL link, e.g., network URL of an image file or single-page PDF file: Example
    • +
    • Python Var: e.g., image data represented by numpy.ndarray;
    • +
• str: e.g., local path of an image file or single-page PDF file: /root/data/img.jpg; URL link, e.g., network URL of an image file or single-page PDF file: Example.
    Python Var|str diff --git a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md index c0fe33b0d1e75cddcf8e6a9a9ab414aeb560b728..9005428459cde0544d15eb183db4a284191eb335 100644 --- a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md +++ b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md @@ -20,7 +20,7 @@ PP-ChatOCRv4 产线中包含版面区域检测模块表格结构识 - [表格结构识别模块](../module_usage/table_structure_recognition.md)(可选) - [文本检测模块](../module_usage/text_detection.md) - [文本识别模块](../module_usage/text_recognition.md) -- [文本行方向分类模块](../module_usage/text_line_orientation_classification.md)(可选) +- [文本行方向分类模块](../module_usage/textline_orientation_classification.md)(可选) - [公式识别模块](../module_usage/formula_recognition.md)(可选) - [印章文本检测模块](../module_usage/seal_text_detection.md)(可选) @@ -197,7 +197,7 @@ PP-ChatOCRv4 产线中包含版面区域检测模块表格结构识 103.08 / 103.08 197.99 / 197.99 6.9 M -SLANet 是百度飞桨视觉团队自研的表格结构识别模型。该模型通过采用CPU 友好型轻量级骨干网络PP-LCNet、高低层特征融合模块CSP-PAN、结构与位置信息对齐的特征解码模块SLA Head,大幅提升了表格结构识别的精度和推理速度。 +SLANet 是百度飞桨视觉团队自研的表格结构识别模型。该模型通过采用CPU 友好型轻量级骨干网络 PP-LCNet、高低层特征融合模块CSP-PAN、结构与位置信息对齐的特征解码模块 SLA Head,大幅提升了表格结构识别的精度和推理速度。 SLANet_plus推理模型/训练模型 @@ -744,7 +744,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径) -
  • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
  • - +待预测数据,必填。如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str @@ -790,330 +785,290 @@ paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数 save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + invoke_mllm -是否使用多模态大模型。 +是否加载并使用多模态大模型。如果不设置,将默认使用产线初始化的该参数值,初始化为Falsebool -False + layout_detection_model_name -用于版面区域检测的模型名称。如果设置为None,将会使用产线默认模型。 +用于版面区域检测的模型名称。如果不设置,将会使用产线默认模型。 str -None + layout_detection_model_dir -版面区域检测模型的目录路径。如果设置为None,将会下载官方模型。 +版面区域检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_model_name -文档去扭曲模型的名称。如果设置为None,将会使用产线默认模型。 +文档去扭曲模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文档去扭曲模型的目录路径。如果设置为None,将会下载官方模型。 +文档去扭曲模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_detection_model_name -文本检测模型的名称。如果设置为None,将会使用产线默认模型。 +文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_detection_model_dir -文本检测模型的目录路径。如果设置为None,将会下载官方模型。 +文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_model_name -文本识别模型的名称。如果设置为None,将会使用产线默认模型。 +文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_recognition_model_dir -文本识别模型的目录路径。如果设置为None,将会下载官方模型。 +文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + table_structure_recognition_model_name -表格结构识别模型的名称。如果设置为None,将会使用产线默认模型。 +表格结构识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + table_structure_recognition_model_dir -表格结构识别模型的目录路径。如果设置为None,将会下载官方模型。 +表格结构识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_text_detection_model_name -印章文本检测模型的名称。如果设置为None,将会使用产线默认模型。 +印章文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + seal_text_detection_model_dir -印章文本检测模型的目录路径。如果设置为None,将会下载官方模型。 +印章文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_text_recognition_model_name -印章文本识别模型的名称。如果设置为None,将会使用产线默认模型。 +印章文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + seal_text_recognition_model_dir -印章文本识别模型的目录路径。如果设置为None,将会下载官方模型。 +印章文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_text_recognition_batch_size -印章文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +印章文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + use_doc_orientation_classify -是否加载文档方向分类功能。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否加载文档去扭曲功能。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档去扭曲模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + + + +use_textline_orientation +是否加载并使用文本行方向分类模块。如果不设置,初始化为True。 +bool + use_seal_recognition -是否加载印章识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用印章识别子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_table_recognition -是否加载表格识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用表格识别子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_threshold 版面模型得分阈值。 -
      -
    • float0-1 之间的任意浮点数;
    • -
    • dict{0:0.1} key为类别ID,value为该类别的阈值;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 0.5
    • -
    +0-1 之间的任意浮点数。如果不设置,将默认使用产线初始化的该参数值,初始化为 0.5。 -float|dict -None +float + layout_nms -版面区域检测模型是否使用NMS后处理。 +版面检测是否使用后处理NMS。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_unclip_ratio -版面区域检测模型检测框的扩张系数。 -
      -
    • float:任意大于 0 浮点数;
    • -
    • Tuple[float,float]:在横纵两个方向各自的扩张系数;
    • -
    • 字典, 字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 1.0
    • -
    +版面区域检测模型检测框的扩张系数。任意大于 0 浮点数。如果不设置,将默认使用产线初始化的该参数值,初始化为 1.0。 -float|Tuple[float,float]|dict -None +float + layout_merge_bboxes_mode -版面区域检测的重叠框过滤方式。 +版面检测中模型输出的检测框的合并处理模式。
      -
    • strlargesmall, union,分别表示重叠框过滤时选择保留大框,小框还是同时保留
    • -
    • dict, 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 large
    • -
    +
  • large,设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框;
  • +
• small,设置为small时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框;
  • +
  • union,不进行框的过滤处理,内外框都保留;
  • +如果不设置,将默认使用产线初始化的该参数值,初始化为large。 -str|dict -None +str + text_det_limit_side_len -文本检测的最大边长度限制。 -
      -
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
    • -
    +文本检测的图像边长限制。大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 960int -None + text_det_limit_type -文本检测的边长度限制类型。 -
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
    • -
    +文本检测的边长度限制类型。支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len +如果不设置,将默认使用产线初始化的该参数值,初始化为 maxstr -None + text_det_thresh -检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
    +检测像素阈值。输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。大于 0 的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.3float -None + text_det_box_thresh -检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + text_det_unclip_ratio -文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
    +文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。大于 0 的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 2.0float -None + text_rec_score_thresh -文本识别阈值,得分大于该阈值的文本结果会被保留。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +文本识别阈值,得分大于该阈值的文本结果会被保留。大于 0 的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + seal_det_limit_side_len -印章文本检测的图像边长限制。 -
      -
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
    • -
    +印章文本检测的图像边长限制。大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 736int -None + seal_det_limit_type -印章文本检测的图像边长限制类型。 -
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
    • -
    +印章文本检测的图像边长限制类型。支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len。如果不设置,将默认使用产线初始化的该参数值,初始化为 minstr -None + seal_det_thresh -检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.2
    +检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。大于 0 的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.2float -None + seal_det_box_thresh -检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。大于 0 的任意浮点数。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + seal_det_unclip_ratio -印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.5float -None + seal_rec_score_thresh -印章文本识别阈值,得分大于该阈值的文本结果会被保留。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +印章文本识别阈值,得分大于该阈值的文本结果会被保留。大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + qianfan_api_key -qianfan api_key +千帆平台的api_key str -None + pp_docbee_base_url 多模态大模型服务的url。 -bool -False +str + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -1121,11 +1076,10 @@ paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • -
    +如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -1153,10 +1107,10 @@ paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数 enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1168,7 +1122,7 @@ paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数 paddlex_config PaddleX产线配置文件路径。 str -None + @@ -1381,25 +1335,31 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: use_doc_orientation_classify -是否加载文档方向分类功能。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否加载文档去扭曲功能。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档去扭曲模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +bool +None + + +use_textline_orientation +是否加载并使用文本行方向分类模块. 如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_seal_recognition -是否加载印章识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用印章识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_table_recognition -是否加载表格识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用表格识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None @@ -1409,7 +1369,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下:
    • float0-1 之间的任意浮点数;
    • dict{0:0.1} key为类别ID,value为该类别的阈值;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 0.5
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 0.5
    float|dict @@ -1417,7 +1377,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: layout_nms -版面区域检测模型是否使用NMS后处理。 +版面检测是否使用后处理NMS。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None @@ -1427,8 +1387,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下:
    • float:任意大于 0 浮点数;
    • Tuple[float,float]:在横纵两个方向各自的扩张系数;
    • -
    • 字典, 字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 1.0
    • +
    • dict,dict的key为int类型,代表cls_id,value为tuple类型,如{0: (1.1,2.0)},表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 1.0
    float|Tuple[float,float]|dict @@ -1438,9 +1398,9 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: layout_merge_bboxes_mode 版面区域检测的重叠框过滤方式。
      -
    • strlargesmall, union,分别表示重叠框过滤时选择保留大框,小框还是同时保留
    • -
    • dict, 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 large
    • +
    • strlargesmallunion,分别表示重叠框过滤时选择保留大框,小框还是同时保留;
    • +
    • dict,dict的key为int类型,代表cls_id,value为str类型,如{0: "large",2: "small"},表示对第0类别检测框使用large模式,对第2类别检测框使用small模式;
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 large
    str|dict @@ -1448,10 +1408,10 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: text_det_limit_side_len -文本检测的最大边长度限制。 +文本检测的图像边长限制。
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 960
    int @@ -1461,8 +1421,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: text_det_limit_type 文本检测的边长度限制类型。
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
    • +
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 max
    str @@ -1472,8 +1432,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: text_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.3
  • float None @@ -1482,8 +1442,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: text_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.6
  • float None @@ -1492,8 +1452,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: text_det_unclip_ratio 文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 2.0
  • float None @@ -1502,8 +1462,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。
  • float None @@ -1512,8 +1472,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: seal_det_limit_side_len 印章文本检测的图像边长限制。
      -
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
    • +
    • int:大于0的任意整数;
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 736
    int @@ -1523,8 +1483,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: seal_det_limit_type 印章文本检测的图像边长限制类型。
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
    • +
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 min
    str @@ -1534,8 +1494,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: seal_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.2
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.2
  • float None @@ -1544,8 +1504,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: seal_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.6
  • float None @@ -1554,8 +1514,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: seal_det_unclip_ratio 印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.5
  • float None @@ -1564,15 +1524,15 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: seal_rec_score_thresh 印章文本识别阈值,得分大于该阈值的文本结果会被保留。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。
  • float None retriever_config -向量检索大模型配置参数。配置内容为如下字典: +向量检索大模型配置参数。配置内容为如下dict:
    {
     "module_name": "retriever",
     "model_name": "embedding-v1",
    @@ -1586,7 +1546,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下:
     
     
     mllm_chat_bot_config
    -多模态大模型配置参数。配置内容为如下字典:
    +多模态大模型配置参数。配置内容为如下dict:
     
    {
     "module_name": "chat_bot",
     "model_name": "PP-DocBee",
    @@ -1600,7 +1560,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下:
     
     
     chat_bot_config
    -大语言模型配置信息。配置内容为如下字典:
    +大语言模型配置信息。配置内容为如下dict:
     
    {
     "module_name": "chat_bot",
     "model_name": "ernie-3.5-8k",
    @@ -1613,26 +1573,8 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下:
     None
     
     
    -input
    -待预测数据,支持多种输入类型,必填。
    -
      -
    • Python Var:如 numpy.ndarray 表示的图像数据
    • -
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • -
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • -
    - -Python Var|str|list -None - - -save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 -str -None - - device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -1640,7 +1582,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下:
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • +
    • None:如果设置为None,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
    str @@ -1668,14 +1610,14 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1708,21 +1650,15 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: input 待预测数据,支持多种输入类型,必填。
      -
    • Python Var:如 numpy.ndarray 表示的图像数据
    • -
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • -
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • +
    • Python Var:如 numpy.ndarray 表示的图像数据;
    • +
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径);
    • +
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    Python Var|str|list -device -与实例化时的参数相同。 -str -None - - use_doc_orientation_classify 是否在推理时使用文档方向分类模块。 bool @@ -1736,7 +1672,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: use_textline_orientation -是否在推理时使用文本行方向分类模块。 +是否加载并使用文本行方向分类模块。 bool None @@ -1887,19 +1823,19 @@ for res in visual_predict_res: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1907,43 +1843,43 @@ for res in visual_predict_res: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False save_to_img() -将中间各个模块的可视化图像保存在png格式的图像 +将中间各个模块的可视化图像保存在png格式的图像。 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 save_to_html() -将文件中的表格保存为html格式的文件 +将文件中的表格保存为html格式的文件。 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 save_to_xlsx() -将文件中的表格保存为xlsx格式的文件 +将文件中的表格保存为xlsx格式的文件。 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -1960,12 +1896,12 @@ for res in visual_predict_res: - `use_table_recognition`: `(bool)` 控制是否启用表格识别子产线 - `use_formula_recognition`: `(bool)` 控制是否启用公式识别子产线 - - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。 + - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个dict,列表顺序为解析后的阅读顺序。 - `block_bbox`: `(np.ndarray)` 版面区域的边界框。 - `block_label`: `(str)` 版面区域的标签,例如`text`, `table`等。 - `block_content`: `(str)` 内容为版面区域内的内容。 - - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的字典 + - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的dict - `input_path`: `(Union[str, None])` 图像OCR子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None` - `model_settings`: `(Dict)` OCR子产线的模型配置参数 - `dt_polys`: `(List[numpy.ndarray])` 文本检测的多边形框列表。每个检测框由4个顶点坐标构成的numpy数组表示,数组shape为(4, 2),数据类型为int16 @@ -1985,12 +1921,12 @@ for res in visual_predict_res: - `rec_scores`: `(List[float])` 文本识别的置信度列表,已按`text_rec_score_thresh`过滤 - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的文本检测框列表,格式同`dt_polys` - - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个字典 + - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个dict - `rec_formula`: `(str)` 公式识别结果 - `rec_polys`: `(numpy.ndarray)` 公式检测框,shape为(4, 2),dtype为int16 - `formula_region_id`: `(int)` 公式所在的区域编号 - - `seal_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 印章识别结果列表,每个元素为一个字典 + - `seal_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 印章识别结果列表,每个元素为一个dict - `input_path`: `(str)` 印章图像的输入路径 - `model_settings`: `(Dict)` 印章识别子产线的模型配置参数 - `dt_polys`: `(List[numpy.ndarray])` 印章检测框列表,格式同`dt_polys` @@ -2002,7 +1938,7 @@ for res in visual_predict_res: - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的印章检测框列表,格式同`dt_polys` - `rec_boxes`: `(numpy.ndarray)` 检测框的矩形边界框数组,shape为(n, 4),dtype为int16。每一行表示一个矩形 - - `table_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], 
str]]])` 表格识别结果列表,每个元素为一个字典 + - `table_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 表格识别结果列表,每个元素为一个dict - `cell_box_list`: `(List[numpy.ndarray])` 表格单元格的边界框列表 - `pred_html`: `(str)` 表格的HTML格式字符串 - `table_ocr_pred`: `(dict)` 表格的OCR识别结果 @@ -2024,16 +1960,16 @@ for res in visual_predict_res: json -获取预测的 json 格式的结果 +获取预测的 json 格式的结果。 img -获取格式为 dict 的可视化图像 +获取格式为 dict 的可视化图像。 - `json` 属性获取的预测结果为dict类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 -- `img` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `layout_det_res`、`overall_ocr_res`、`text_paragraphs_ocr_res`、`formula_res_region1`、`table_cell_img` 和 `seal_res_region1`,对应的值是 `Image.Image` 对象:分别用于显示版面区域检测、OCR、OCR文本段落、公式、表格和印章结果的可视化图像。如果没有使用可选模块,则字典中只包含 `layout_det_res`。 +- `img` 属性返回的预测结果是一个dict类型的数据。其中,键分别为 `layout_det_res`、`overall_ocr_res`、`text_paragraphs_ocr_res`、`formula_res_region1`、`table_cell_img` 和 `seal_res_region1`,对应的值是 `Image.Image` 对象:分别用于显示版面区域检测、OCR、OCR文本段落、公式、表格和印章结果的可视化图像。如果没有使用可选模块,则dict中只包含 `layout_det_res`。
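下面给出访问上述 `json` 与 `img` 属性的最小示意(仅为草图:假设 `visual_predict_res` 为第(3)步的返回结果,且其中每个元素的 `layout_parsing_result` 即为上文描述的结果对象,键名与属性以实际文档为准):

```python
import os

os.makedirs("output", exist_ok=True)
# 示意:遍历视觉预测结果,读取 json 属性并保存 img 属性中的可视化图像
for res in visual_predict_res:
    layout_parsing_result = res["layout_parsing_result"]
    # json 属性为 dict,内容与 save_to_json() 保存的一致
    print(layout_parsing_result.json.keys())
    # img 属性为 dict,键如 layout_det_res、overall_ocr_res 等,值为 Image.Image 对象
    for name, image in layout_parsing_result.img.items():
        image.save(os.path.join("output", f"{name}.png"))
```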
    (4)调用PP-ChatOCRv4的产线对象的 build_vector() 方法,对文本内容进行向量构建。 @@ -2052,25 +1988,25 @@ for res in visual_predict_res: visual_info -视觉信息,可以是包含视觉信息的字典,或者由这些字典组成的列表 +视觉信息,可以是包含视觉信息的dict,或者由这些dict组成的列表。 list|dict None min_characters -最小字符数量。为大于0的正整数,可以根据大语言模型支持的token长度来决定 +最小字符数量。为大于0的正整数,可以根据大语言模型支持的token长度来决定。 int 3500 block_size -长文本建立向量库时分块大小。为大于0的正整数,可以根据大语言模型支持的token长度来决定 +长文本建立向量库时分块大小。为大于0的正整数,可以根据大语言模型支持的token长度来决定。 int 300 flag_save_bytes_vector -文字是否保存为二进制文件 +文字是否保存为二进制文件。 bool False @@ -2081,7 +2017,7 @@ for res in visual_predict_res: None -该方法会返回一个包含视觉文本信息的字典,字典的内容如下: +该方法会返回一个包含视觉文本信息的dict,dict的内容如下: - `flag_save_bytes_vector`:`(bool)`是否将结果保存为二进制文件 - `flag_too_short_text`:`(bool)`是否文本长度小于最小字符数量 @@ -2106,8 +2042,8 @@ for res in visual_predict_res: input 待预测数据,支持多种输入类型,必填。
      -
    • Python Var:如 numpy.ndarray 表示的图像数据
    • -
    • str:如图像文件或者单页PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或单页PDF文件的网络URL:示例
    • +
    • Python Var:如 numpy.ndarray 表示的图像数据;
    • +
    • str:如图像文件或者单页PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或单页PDF文件的网络URL:示例
    Python Var|str @@ -2115,7 +2051,7 @@ for res in visual_predict_res: key_list -用于提取信息的单个键或键列表 +用于提取信息的单个键或键列表。 Union[str, List[str]] None @@ -2146,25 +2082,25 @@ for res in visual_predict_res: key_list -用于提取信息的单个键或键列表 +用于提取信息的单个键或键列表。 Union[str, List[str]] None visual_info -视觉信息结果 +视觉信息结果。 List[dict] None use_vector_retrieval -是否使用向量检索 +是否使用向量检索。 bool True vector_info -用于检索的向量信息 +用于检索的向量信息。 dict None @@ -2176,67 +2112,67 @@ for res in visual_predict_res: text_task_description -文本任务的描述 +文本任务的描述。 str None text_output_format -文本结果的输出格式 +文本结果的输出格式。 str None text_rules_str -生成文本结果的规则 +生成文本结果的规则。 str None text_few_shot_demo_text_content -用于少样本演示的文本内容 +用于少样本演示的文本内容。 str None text_few_shot_demo_key_value_list -用于少样本演示的键值列表 +用于少样本演示的键值列表。/td> str None table_task_description -表任务的描述 +表任务的描述。 str None table_output_format -表结果的输出格式 +表结果的输出格式。 str None table_rules_str -生成表结果的规则 +生成表结果的规则。 str None table_few_shot_demo_text_content -表少样本演示的文本内容 +表少样本演示的文本内容。 str None table_few_shot_demo_key_value_list -表少样本演示的键值列表 +表少样本演示的键值列表。 str None mllm_predict_info -多模态大模型结果 +多模态大模型结果。 dict None @@ -2244,7 +2180,7 @@ for res in visual_predict_res: None mllm_integration_strategy -多模态大模型和大语言模型数据融合策略,支持单独使用其中一个或者融合两者结果。可选:"integration", "llm_only" and "mllm_only" +多模态大模型和大语言模型数据融合策略,支持单独使用其中一个或者融合两者结果。可选:"integration", "llm_only" and "mllm_only"。 str "integration" @@ -2264,7 +2200,7 @@ for res in visual_predict_res: 该方法会将结果打印到终端,打印到终端的内容解释如下: - - `chat_res`: `(dict)` 提取信息的结果,是一个字典,包含了待抽取的键和对应的值。 + - `chat_res`: `(dict)` 提取信息的结果,是一个dict,包含了待抽取的键和对应的值。
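为说明上述各方法的配合方式,这里给出一个贯穿 visual_predict()、build_vector()、mllm_pred() 与 chat() 的最小串联示意(仅为草图:类名沿用下文配置文件示例中的 PPChatOCRv4;retriever_config、mllm_chat_bot_config、chat_bot_config 等大模型配置需按上文参数说明自行提供,此处省略;各方法的返回结构以实际文档为准):

```python
# 示意:PP-ChatOCRv4 从视觉预测到关键信息抽取的典型调用顺序
from paddleocr import PPChatOCRv4

pipeline = PPChatOCRv4()

# (1) 视觉预测,收集 visual_info
visual_predict_res = pipeline.visual_predict(input="vehicle_certificate-1.png")
visual_info_list = [res["visual_info"] for res in visual_predict_res]

# (2) 按上文 build_vector() 参数说明构建向量库
vector_info = pipeline.build_vector(
    visual_info=visual_info_list, flag_save_bytes_vector=True)

# (3) 可选:多模态大模型预测(需已配置 mllm_chat_bot_config)
mllm_predict_res = pipeline.mllm_pred(
    input="vehicle_certificate-1.png", key_list=["驾驶室准乘人数"])

# (4) 关键信息抽取,打印内容中的 chat_res 为包含待抽取键值的 dict
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    use_vector_retrieval=True,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_res["mllm_res"],
)
print(chat_result)
```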
    @@ -3046,7 +2982,7 @@ paddleocr pp_chatocrv4_doc --paddlex_config PP-ChatOCRv4.yaml ... 4. 在 Python API 中加载产线配置文件 -初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置字典,PaddleOCR 会读取其中的内容作为产线配置。示例如下: +初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置dict,PaddleOCR 会读取其中的内容作为产线配置。示例如下: ```python from paddleocr import PPChatOCRv4 diff --git a/docs/version3.x/pipeline_usage/PP-StructureV3.en.md b/docs/version3.x/pipeline_usage/PP-StructureV3.en.md index 303595eade2c6d7e4fd9dfbb0e44cef1d49ddd98..63b349235606e0496426d2836eaacc34b8e401c0 100644 --- a/docs/version3.x/pipeline_usage/PP-StructureV3.en.md +++ b/docs/version3.x/pipeline_usage/PP-StructureV3.en.md @@ -2,13 +2,13 @@ comments: true --- -# PP-StructureV3 Production Line User Guide +# PP-StructureV3 Pipeline Usage Tutorial -## 1. Introduction to PP-StructureV3 Production Line +## 1. Introduction to PP-StructureV3 Pipeline -Layout analysis is a technique used to extract structured information from document images. It is primarily used to convert complex document layouts into machine-readable data formats. This technology has broad applications in document management, information extraction, and data digitization. Layout analysis combines Optical Character Recognition (OCR), image processing, and machine learning algorithms to identify and extract text blocks, titles, paragraphs, images, tables, and other layout elements from documents. This process generally includes three main steps: layout analysis, element analysis, and data formatting. The final result is structured document data, which enhances the efficiency and accuracy of data processing. PP-StructureV3 improves upon the general layout analysis v1 production line by enhancing layout region detection, table recognition, and formula recognition. It also adds capabilities such as multi-column reading order recovery and result conversion to Markdown files. It performs excellently across various document types and can handle complex document data. This production line also provides flexible service deployment options, supporting invocation using multiple programming languages on various hardware. In addition, it offers secondary development capabilities, allowing you to train and fine-tune models on your own dataset and integrate the trained models seamlessly. +Layout analysis is a technique used to extract structured information from document images. It is primarily used to convert complex document layouts into machine-readable data formats. This technology has broad applications in document management, information extraction, and data digitization. Layout analysis combines Optical Character Recognition (OCR), image processing, and machine learning algorithms to identify and extract text blocks, titles, paragraphs, images, tables, and other layout elements from documents. This process generally includes three main steps: layout analysis, element analysis, and data formatting. The final result is structured document data, which enhances the efficiency and accuracy of data processing. PP-StructureV3 improves upon the general layout analysis v1 pipeline by enhancing layout region detection, table recognition, and formula recognition. It also adds capabilities such as multi-column reading order recovery, chart understanding, and result conversion to Markdown files. It performs excellently across various document types and can handle complex document data. 
This pipeline also provides flexible service deployment options, supporting invocation using multiple programming languages on various hardware. In addition, it offers secondary development capabilities, allowing you to train and fine-tune models on your own dataset and integrate the trained models seamlessly. -PP-StructureV3 includes the following six modules. Each module can be independently trained and inferred, and contains multiple models. Click the corresponding module for more documentation. +PP-StructureV3 includes the following six modules. Each module can be independently trained and inferred, and contains multiple models. Click the corresponding module for more documentation. - [Layout Detection Module](../module_usage/layout_detection.en.md) - [General OCR Subline](./OCR.en.md) @@ -16,7 +16,6 @@ Layout analysis is a technique used to extract structured information from docum - [Table Recognition Subline ](./table_recognition_v2.en.md) (Optional) - [Seal Recognition Subline](./seal_recognition.en.md) (Optional) - [Formula Recognition Subline](./formula_recognition.en.md) (Optional) -- [Chart Parsing Module ]() (Optional) In this pipeline, you can choose the model to use based on the benchmark data below. @@ -774,7 +773,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/DocUNet @@ -814,7 +813,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Command line supports more parameters. Click to expand for detailed parameter descriptions @@ -964,476 +965,426 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu - - + - + - + - + - + - + - + - - - + + - + - + - - - + + - - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + - - + - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + - + - + - + - + - + - + - - + - + - + - + - + - + - + - + - + - + - + + + + + + + - + - + - + - + - + - + - + - + - + - + - - + @@ -1461,9 +1412,9 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu - + - + @@ -1475,7 +1426,7 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu - +
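Before the option tables, a minimal Python sketch that mirrors the CLI invocation referenced above (illustrative only; the keyword names are the ones documented in the table that follows):

```python
# Sketch: Python counterpart of
#   paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu
# using a few of the options documented below as keyword arguments.
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    device="gpu:0",              # see the `device` entry below
    layout_threshold=0.5,        # documented default score threshold
    use_table_recognition=True,  # sub-pipeline switch documented below
)
for res in pipeline.predict(input="./pp_structure_v3_demo.png"):
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```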
    inputData to be predicted. Required. Supports multiple input types. -
      -
    • Python Var: e.g., numpy.ndarray representing image data
    • -
    • str: e.g., local path to image or PDF file: /root/data/img.jpg; URL, e.g., online image or PDF: example; local directory: directory containing images to predict, e.g., /root/data/ (currently, directories with PDFs are not supported; PDFs must be specified by file path)
    • -
    • List: list elements must be one of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
    • -
    +
Data to be predicted. Required. +e.g., local path to image or PDF file: /root/data/img.jpg; URL, e.g., online image or PDF: example; local directory: directory containing images to predict, e.g., /root/data/ (currently, directories with PDFs are not supported; PDFs must be specified by file path). Python Var|str|list str
    save_pathPath to save inference results. If set to None, results will not be saved locally.Path to save inference results. If not set, results will not be saved locally. strNone
    layout_detection_model_nameName of the layout detection model. If set to None, the default model will be used.Name of the layout detection model. If not set, the default model will be used. strNone
    layout_detection_model_dirDirectory path of the layout detection model. If set to None, the official model will be downloaded.Directory path of the layout detection model. If not set, the official model will be downloaded. strNone
    layout_thresholdScore threshold for the layout model. -
      -
    • float: any value between 0-1
    • -
    • dict: {0:0.1}, where key is class ID and value is the threshold for that class
    • -
    • None: if set to None, the default value is used, which is 0.5
    • -
    +
    Score threshold for the layout model. Any value between 0-1. If not set, the default value is used, which is 0.5. float|dictNonefloat
    layout_nmsWhether to apply NMS post-processing for layout detection model.Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If not set, the parameter will default to the value initialized in the pipeline, which is set to True by default. boolNone
    layout_unclip_ratioUnclip ratio for detected boxes in layout detection model. -
      -
    • float: any float > 0
    • -
    • Tuple[float,float]: separate ratios for width and height
    • -
    • dict: key is int (class ID), value is tuple, e.g., {0: (1.1, 2.0)} means class 0 boxes will be expanded 1.1x in width, 2.0x in height
    • -
    • None: if set to None, default is 1.0
    • -
    -
    float|Tuple[float,float]|dictNoneUnclip ratio for detected boxes in layout detection model. Any float > 0. If not set, the default is 1.0. +float
    layout_merge_bboxes_modeMerge mode for overlapping boxes in layout detection. +The merging mode for the detection boxes output by the model in layout region detection.
      -
    • str: large, small, union, for keeping larger box, smaller box, or both
    • -
    • dict: key is int (class ID), value is str, e.g., {0: "large", 2: "small"}
    • -
    • None: if set to None, default is large
    • -
    +
  • large: When set to "large", only the largest outer bounding box will be retained for overlapping bounding boxes, and the inner overlapping boxes will be removed;
  • +
  • small: When set to "small", only the smallest inner bounding boxes will be retained for overlapping bounding boxes, and the outer overlapping boxes will be removed;
  • +
  • union: No filtering of bounding boxes will be performed, and both inner and outer boxes will be retained;
  • +If not set, the default is large.
    str|dictNonestr
    chart_recognition_model_nameName of the chart recognition model. If set to None, the default model will be used.Name of the chart recognition model. If not set, the default model will be used. strNone
    chart_recognition_model_dirDirectory path of the chart recognition model. If set to None, the official model will be downloaded.Directory path of the chart recognition model. If not set, the official model will be downloaded. strNone
    chart_recognition_batch_sizeBatch size for the chart recognition model. If set to None, the default batch size is 1.Batch size for the chart recognition model. If not set, the default batch size is 1. intNone
    region_detection_model_nameName of the region detection model. If set to None, the default model will be used.Name of the region detection model. If not set, the default model will be used. strNone
    region_detection_model_dirDirectory path of the region detection model. If set to None, the official model will be downloaded.Directory path of the region detection model. If not set, the official model will be downloaded. strNone
    doc_orientation_classify_model_nameName of the document orientation classification model. If set to None, the default model will be used.Name of the document orientation classification model. If not set, the default model will be used. strNone
    doc_orientation_classify_model_dirDirectory path of the document orientation classification model. If set to None, the official model will be downloaded.Directory path of the document orientation classification model. If not set, the official model will be downloaded. strNone
    doc_unwarping_model_nameName of the document unwarping model. If set to None, the default model will be used.Name of the document unwarping model. If not set, the default model will be used. strNone
    doc_unwarping_model_dirDirectory path of the document unwarping model. If set to None, the official model will be downloaded.Directory path of the document unwarping model. If not set, the official model will be downloaded. strNone
    text_detection_model_nameName of the text detection model. If set to None, the default model will be used.Name of the text detection model. If not set, the default model will be used. strNone
    text_detection_model_dirDirectory path of the text detection model. If set to None, the official model will be downloaded.Directory path of the text detection model. If not set, the official model will be downloaded. strNone
    text_det_limit_side_lenMaximum side length limit for text detection. -
      -
    • int: any integer > 0;
    • -
    • None: if set to None, the default value will be 960;
    • -
    +
    Image side length limitation for text detection. Any integer > 0. If not set, the default value will be 960. intNone
    text_det_limit_type -
      -
    • str: supports min and max; min means ensuring the shortest side of the image is not less than det_limit_side_len, max means the longest side does not exceed limit_side_len
    • -
    • None: if set to None, the default value will be max.
    • -
    +
Type of the image side length limit for text detection. +Supports min and max; min means ensuring the shortest side of the image is not less than det_limit_side_len, max means the longest side does not exceed limit_side_len. If not set, the default value will be max. str None
    text_det_threshPixel threshold for detection. Pixels with scores above this value in the probability map are considered text. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.3
    • -
    +
    Pixel threshold for detection. Pixels with scores above this value in the probability map are considered text.Any float > 0 +. If not set, the default is 0.3. floatNone
    text_det_box_thresh Box threshold. A bounding box is considered text if the average score of pixels inside is greater than this value. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.6
    • -
    +Any float > 0. If not set, the default is 0.6.
    floatNone
    text_det_unclip_ratio Expansion ratio for text detection. The higher the value, the larger the expansion area. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 2.0
    • -
    +any float > 0. If not set, the default is 2.0.
    floatNone
    textline_orientation_model_nameName of the text line orientation model. If set to None, the default model will be used.Name of the text line orientation model. If not set, the default model will be used. strNone
    textline_orientation_model_dirDirectory of the text line orientation model. If set to None, the official model will be downloaded.Directory of the text line orientation model. If not set, the official model will be downloaded. strNone
    textline_orientation_batch_sizeBatch size for the text line orientation model. If set to None, default is 1.Batch size for the text line orientation model. If not set, the default is 1. intNone
    text_recognition_model_nameName of the text recognition model. If set to None, the default model will be used.Name of the text recognition model. If not set, the default model will be used. strNone
    text_recognition_model_dirDirectory of the text recognition model. If set to None, the official model will be downloaded.Directory of the text recognition model. If not set, the official model will be downloaded. strNone
    text_recognition_batch_sizeBatch size for text recognition. If set to None, default is 1.Batch size for text recognition. If not set, the default is 1. intNone
    text_rec_score_thresh Score threshold for text recognition. Only results above this value will be kept. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.0 (no threshold)
    • -
    +Any float > 0. If not set, the default is 0.0 (no threshold).
    floatNone
    table_classification_model_nameName of the table classification model. If set to None, the default model will be used.Name of the table classification model. If not set, the default model will be used. strNone
    table_classification_model_dirDirectory of the table classification model. If set to None, the official model will be downloaded.Directory of the table classification model. If not set, the official model will be downloaded. strNone
    wired_table_structure_recognition_model_nameName of the wired table structure recognition model. If set to None, the default model will be used.Name of the wired table structure recognition model. If not set, the default model will be used. strNone
    wired_table_structure_recognition_model_dirDirectory of the wired table structure recognition model. If set to None, the official model will be downloaded.Directory of the wired table structure recognition model. If not set, the official model will be downloaded. strNone
    wireless_table_structure_recognition_model_nameName of the wireless table structure recognition model. If set to None, the default model will be used.Name of the wireless table structure recognition model. If not set, the default model will be used. strNone
    wireless_table_structure_recognition_model_dirDirectory of the wireless table structure recognition model. If set to None, the official model will be downloaded.Directory of the wireless table structure recognition model. If not set, the official model will be downloaded. strNone
    wired_table_cells_detection_model_nameName of the wired table cell detection model. If set to None, the default model will be used.Name of the wired table cell detection model. If not set, the default model will be used. strNone
    wired_table_cells_detection_model_dirDirectory of the wired table cell detection model. If set to None, the official model will be downloaded.Directory of the wired table cell detection model. If not set, the official model will be downloaded. strNone
    wireless_table_cells_detection_model_nameName of the wireless table cell detection model. If set to None, the default model will be used.Name of the wireless table cell detection model. If not set, the default model will be used. strNone
    wireless_table_cells_detection_model_dirDirectory of the wireless table cell detection model. If set to None, the official model will be downloaded.Directory of the wireless table cell detection model. If not set, the official model will be downloaded. strNone
    seal_text_detection_model_nameName of the seal text detection model. If set to None, the default model will be used.Name of the seal text detection model. If not set, the default model will be used. strNone
    seal_text_detection_model_dirDirectory of the seal text detection model. If set to None, the official model will be downloaded.Directory of the seal text detection model. If not set, the official model will be downloaded. strNone
    seal_det_limit_side_len Image side length limit for seal text detection. -
      -
    • int: any integer > 0;
    • -
    • None: if set to None, the default is 736;
    • -
    +Any integer > 0. If not set, the default is 736.
    intNone
    seal_det_limit_type Limit type for image side in seal text detection. -
      -
    • str: supports min and max; min ensures shortest side ≥ det_limit_side_len, max ensures longest side ≤ limit_side_len
    • -
    • None: if set to None, default is min;
    • -
    +supports min and max; min ensures shortest side ≥ det_limit_side_len, max ensures longest side ≤ limit_side_len. If not set, the default is min.
    strNone
    seal_det_thresh Pixel threshold. Pixels with scores above this value in the probability map are considered text. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.2
    • -
    +any float > 0. If not set, the default is 0.2.
    floatNone
    seal_det_box_thresh Box threshold. Boxes with average pixel scores above this value are considered text regions. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.6
    • -
    +any float > 0. If not set, the default is 0.6.
    floatNone
    seal_det_unclip_ratioExpansion ratio for seal text detection. Higher value means larger expansion area. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.5
    • -
    +
    Expansion ratio for seal text detection. Higher value means larger expansion area.Any float > 0. If not set, the default is 0.5. floatNone
    seal_text_recognition_model_nameName of the seal text recognition model. If set to None, the default model will be used.Name of the seal text recognition model. If not set, the default model will be used. strNone
    seal_text_recognition_model_dirDirectory of the seal text recognition model. If set to None, the official model will be downloaded.Directory of the seal text recognition model. If not set, the official model will be downloaded. strNone
    seal_text_recognition_batch_sizeBatch size for seal text recognition. If set to None, default is 1.Batch size for seal text recognition. If not set, the default is 1. intNone
    seal_rec_score_threshRecognition score threshold. Text results above this value will be kept. -
      -
    • float: any float > 0
    • -
    • None: if set to None, default is 0.0 (no threshold)
    • -
    +
    Recognition score threshold. Text results above this value will be kept. Any float > 0. If not set, the default is 0.0 (no threshold). floatNone
    formula_recognition_model_nameName of the formula recognition model. If set to None, the default model will be used.Name of the formula recognition model. If not set, the default model will be used. strNone
    formula_recognition_model_dirDirectory of the formula recognition model. If set to None, the official model will be downloaded.Directory of the formula recognition model. If not set, the official model will be downloaded. strNone
    formula_recognition_batch_sizeBatch size of the formula recognition model. If set to None, the default is 1.Batch size of the formula recognition model. If not set, the default is 1. intNone
    use_doc_orientation_classifyWhether to enable document orientation classification. If set to None, default is True.Whether to load and use document orientation classification module. If not set, the default is True. boolNone
    use_doc_unwarpingWhether to enable document unwarping. If set to None, default is True.Whether to load and use document unwarping module. If not set, the default is True. boolNone
    use_textline_orientationWhether to load and use the text line orientation classification module. If not set, the default is True.bool
    use_seal_recognitionWhether to enable seal recognition subpipeline. If set to None, default is True.Whether to load and use seal recognition subpipeline. If not set, the default is True. boolNone
    use_table_recognitionWhether to enable table recognition subpipeline. If set to None, default is True.Whether to load and use table recognition subpipeline. If not set, the default is True. boolNone
    use_formula_recognitionWhether to enable formula recognition subpipeline. If set to None, default is True.Whether to load and use formula recognition subpipeline. If not set, the default is True. boolNone
    use_chart_recognitionWhether to enable chart recognition model. If set to None, default is True.Whether to load and use the chart recognition sub-pipeline. If not set, the default is True. boolNone
    use_region_detectionWhether to enable region detection submodule for document images. If set to None, default is True.Whether to load and use the document region detection pipeline. If not set, the default is True. boolNone
    deviceDevice for inference. You can specify a device ID. +Device for inference. You can specify a device ID:
      -
    • CPU: e.g., cpu
    • +
    • CPU: e.g., cpu means using CPU for inference;
    • GPU: e.g., gpu:0 means GPU 0
    • NPU: e.g., npu:0 means NPU 0
    • XPU: e.g., xpu:0 means XPU 0
    • MLU: e.g., mlu:0 means MLU 0
    • DCU: e.g., dcu:0 means DCU 0
    • -
    • None: If set to None, GPU 0 will be used by default if available; otherwise, CPU will be used.
    • -
    +If not set, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
    strNone
    enable_hpi
    enable_mkldnnWhether to enable MKL-DNN. If set to None, enabled by default.Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. boolNoneTrue
    cpu_threadspaddlex_config Path to the PaddleX pipeline configuration file. strNone
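The inference-related options at the end of this table map onto Python keyword arguments of the same names, as elsewhere in these docs. A hedged sketch of CPU inference with MKL-DNN (the YAML path is illustrative):

```python
# Sketch: CPU inference with MKL-DNN acceleration and an explicit thread
# count; optionally load a PaddleX pipeline configuration file.
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    device="cpu",
    enable_mkldnn=True,  # documented default; skipped if MKL-DNN unavailable
    cpu_threads=8,
    paddlex_config="PP-StructureV3.yaml",  # hypothetical local config path
)
for res in pipeline.predict(input="./pp_structure_v3_demo.png"):
    res.print()
```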
    @@ -1606,7 +1557,7 @@ The above Python script performs the following steps:
    • float: Any float between 0-1;
    • dict: {0:0.1} where the key is the class ID and the value is the threshold for that class;
    • -
    • None: If set to None, uses the pipeline default of 0.5;
    • +
    • None: If set to None, uses the pipeline default of 0.5.
    float|dict @@ -1614,7 +1565,7 @@ The above Python script performs the following steps: layout_nms -Whether to use NMS post-processing for the layout detection model. +Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If set to None, the parameter will default to the value initialized in the pipeline, which is set to True by default. bool None @@ -1625,7 +1576,7 @@ The above Python script performs the following steps:
  • float: Any float greater than 0;
  • Tuple[float,float]: Expansion ratios in horizontal and vertical directions;
  • dict: A dictionary with int keys representing cls_id, and tuple values, e.g., {0: (1.1, 2.0)} means width is expanded 1.1× and height 2.0× for class 0 boxes;
  • -
  • None: If set to None, uses the pipeline default of 1.0;
  • +
  • None: If set to None, uses the pipeline default of 1.0.
  • float|Tuple[float,float]|dict @@ -1637,7 +1588,7 @@ The above Python script performs the following steps:
    • str: Options include large, small, and union to retain the larger box, smaller box, or both;
    • dict: A dictionary with int keys representing cls_id, and str values, e.g., {0: "large", 2: "small"} means using different modes for different classes;
    • -
    • None: If set to None, uses the pipeline default value large;
    • +
    • None: If set to None, uses the pipeline default value large.
    str|dict @@ -1711,10 +1662,10 @@ The above Python script performs the following steps: text_det_limit_side_len -Maximum side length limit for text detection. +Image side length limitation for text detection.
    • int: Any integer greater than 0;
    • -
    • None: If set to None, uses the pipeline default of 960;
    • +
    • None: If set to None, uses the pipeline default of 960.
    int @@ -1725,7 +1676,7 @@ The above Python script performs the following steps:
    • str: Supports min and max. min ensures the shortest side is no less than det_limit_side_len, while max ensures the longest side is no greater than limit_side_len;
    • -
    • None: If set to None, uses the pipeline default of max;
    • +
    • None: If set to None, uses the pipeline default of max.
    str @@ -1736,7 +1687,7 @@ The above Python script performs the following steps: Pixel threshold for detection. Pixels in the output probability map with scores above this value are considered as text pixels.
    • float: Any float greater than 0;
    • -
    • None: If set to None, uses the pipeline default value of 0.3;
    • +
    • None: If set to None, uses the pipeline default value of 0.3.
    float @@ -1747,7 +1698,7 @@ The above Python script performs the following steps: Bounding box threshold. If the average score of all pixels inside the box exceeds this threshold, it is considered a text region.
    • float: Any float greater than 0;
    • -
    • None: If set to None, uses the pipeline default value of 0.6;
    • +
    • None: If set to None, uses the pipeline default value of 0.6.
    float @@ -1758,7 +1709,7 @@ The above Python script performs the following steps: Expansion ratio for text detection. The larger the value, the more the text region is expanded.
    • float: Any float greater than 0;
    • -
    • None: If set to None, uses the pipeline default value of 2.0;
    • +
    • None: If set to None, uses the pipeline default value of 2.0.
    float @@ -1805,7 +1756,7 @@ The above Python script performs the following steps: Score threshold for text recognition. Only results with scores above this threshold will be retained.
    • float: Any float greater than 0;
    • -
    • None: If set to None, uses the pipeline default of 0.0 (no threshold);
    • +
    • None: If set to None, uses the pipeline default of 0.0 (no threshold).
    float @@ -1889,7 +1840,7 @@ The above Python script performs the following steps: Image side length limit for seal text detection.
    • int: Any integer greater than 0;
    • -
    • None: If set to None, the default value is 736;
    • +
    • None: If set to None, the default value is 736.
    int @@ -1900,7 +1851,7 @@ The above Python script performs the following steps: Limit type for seal text detection image side length.
    • str: Supports min and max. min ensures the shortest side is no less than det_limit_side_len, while max ensures the longest side is no greater than limit_side_len;
    • -
    • None: If set to None, the default value is min;
    • +
    • None: If set to None, the default value is min.
    str @@ -1911,7 +1862,7 @@ The above Python script performs the following steps: Pixel threshold for detection. Pixels with scores greater than this value in the probability map are considered text pixels.
    • float: Any float greater than 0;
    • -
    • None: If set to None, the default value is 0.2;
    • +
    • None: If set to None, the default value is 0.2.
    float @@ -1922,7 +1873,7 @@ The above Python script performs the following steps: Bounding box threshold. If the average score of all pixels inside a detection box exceeds this threshold, it is considered a text region.
    • float: Any float greater than 0;
    • -
    • None: If set to None, the default value is 0.6;
    • +
    • None: If set to None, the default value is 0.6.
    float @@ -1933,7 +1884,7 @@ The above Python script performs the following steps: Expansion ratio for seal text detection. The larger the value, the larger the expanded area.
    • float: Any float greater than 0;
    • -
    • None: If set to None, the default value is 0.5;
    • +
    • None: If set to None, the default value is 0.5.
    float @@ -1962,7 +1913,7 @@ The above Python script performs the following steps: Score threshold for seal text recognition. Text results with scores above this threshold will be retained.
    • float: Any float greater than 0;
    • -
    • None: If set to None, the default value is 0.0 (no threshold);
    • +
    • None: If set to None, the default value is 0.0 (no threshold).
    float @@ -1999,20 +1950,44 @@ The above Python script performs the following steps: None +use_textline_orientation +Whether to use the text line orientation classification. If not set, the default is True. +bool + + + +use_seal_recognition +Whether to enable seal recognition subpipeline. If not set, the default is True. +bool + + + +use_table_recognition +Whether to enable table recognition subpipeline. If not set, the default is True. +bool + + + +use_formula_recognition +Whether to enable formula recognition subpipeline. If not set, the default is True. +bool + + + use_chart_recognition -Whether to enable the chart recognition model. If set to None, the default value is True. +Whether to use the chart recognition sub-pipeline. If set to None, the default value is True. bool None use_region_detection -Whether to enable the region detection model for document layout. If set to None, the default value is True. +Whether to use the document region detection pipeline. If set to None, the default value is True. bool None device -Device used for inference. Supports specifying device ID. +Device used for inference. Supports specifying device ID:
    • CPU: e.g., cpu means using CPU for inference;
    • GPU: e.g., gpu:0 means using GPU 0;
    • @@ -2020,7 +1995,7 @@ The above Python script performs the following steps:
    • XPU: e.g., xpu:0 means using XPU 0;
    • MLU: e.g., mlu:0 means using MLU 0;
    • DCU: e.g., dcu:0 means using DCU 0;
    • -
    • None: If set to None, GPU 0 will be used by default. If GPU is not available, CPU will be used;
    • +
    • None: If set to None, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
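For example, a device string from the list above can be passed directly when the pipeline is created (a sketch; substitute "cpu", "npu:0", and so on for other hardware):

```python
from paddleocr import PPStructureV3

# Run inference on the first GPU; any device string from the list above works.
pipeline = PPStructureV3(device="gpu:0")
```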
    str @@ -2048,13 +2023,13 @@ The above Python script performs the following steps: precision Computation precision, e.g., fp32, fp16. str -fp32 +"fp32" enable_mkldnn -Whether to enable MKL-DNN acceleration. If set to None, MKL-DNN is enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -2088,21 +2063,15 @@ The above Python script performs the following steps: input Input data to be predicted. Required. Supports multiple types:
      -
    • Python Var: Image data represented as numpy.ndarray
    • -
    • str: Local path to image or PDF file, e.g., /root/data/img.jpg; URL to image or PDF, e.g., example; directory containing image files, e.g., /root/data/ (directories with PDFs are not supported, use full file path for PDFs)
    • -
    • List: Elements can be any of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"]
    • +
    • Python Var: Image data represented as numpy.ndarray;
    • +
    • str: Local path to image or PDF file, e.g., /root/data/img.jpg; URL to image or PDF, e.g., example; directory containing image files, e.g., /root/data/ (directories with PDFs are not supported, use full file path for PDFs);
    • +
    • List: Elements can be any of the above types, e.g., [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"].
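A short sketch of these input types in practice (the file paths are placeholders):

```python
import numpy as np
from paddleocr import PPStructureV3

pipeline = PPStructureV3()

# An in-memory numpy array and local file paths can be mixed in one list.
image = np.zeros((640, 640, 3), dtype=np.uint8)  # placeholder image data
inputs = [image, "/root/data/img1.jpg", "/root/data/img2.jpg"]
for res in pipeline.predict(input=inputs):
    res.print()
```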
    Python Var|str|list -device -Same as the parameter used during initialization. -str -None - - use_doc_orientation_classify Whether to use document orientation classification during inference. bool @@ -2139,6 +2108,18 @@ The above Python script performs the following steps: None +use_chart_recognition +Whether to use the chart recognition sub-pipeline. If set to None, the default value is True. +bool +None + + +use_region_detection +Whether to use the document region detection pipeline. If set to None, the default value is True. +bool +None + + layout_threshold Same as the parameter used during initialization. float|dict @@ -2234,6 +2215,66 @@ The above Python script performs the following steps: float None + +use_wired_table_cells_trans_to_html +Whether to enable direct conversion of wired table cell detection results to HTML. Default is False. If enabled, HTML will be constructed directly based on the geometric relationship of wired table cell detection results. +
      +
    • bool: True or False;
    • +
    • None: If set to None, it will default to the initialized parameter value, initialized as False;
    • +
    +bool|None +False + + +use_wireless_table_cells_trans_to_html +Whether to enable direct conversion of wireless table cell detection results to HTML. Default is False. If enabled, HTML will be constructed directly based on the geometric relationship of wireless table cell detection results. +
      +
    • bool: True or False;
    • +
    • None: If set to None, it will default to the initialized parameter value, initialized as False;
    • +
    +bool|None +False + + +use_table_orientation_classify +Whether to enable table orientation classification. When enabled, it can correct the orientation and correctly complete table recognition if the table in the image is rotated by 90/180/270 degrees. +
      +
    • bool: True or False;
    • +
    • None: If set to None, it will default to the initialized parameter value, initialized as True;
    • +
    +bool|None +True + + +use_ocr_results_with_table_cells +Whether to enable OCR within cell segmentation. When enabled, OCR detection results will be segmented and re-recognized based on cell prediction results to avoid text loss. +
      +
    • bool: True or False;
    • +
    • None: If set to None, it will default to the initialized parameter value, initialized as True;
    • +
    +bool|None +True + + +use_e2e_wired_table_rec_model +Whether to enable end-to-end wired table recognition mode. If enabled, the cell detection model will not be used, and only the table structure recognition model will be used. +
      +
    • bool: True or False;
    • +
    • None: If set to None, it will default to the initialized parameter value, initialized as False;
    • +
    +bool|None +False + + +use_e2e_wireless_table_rec_model +Whether to enable end-to-end wireless table recognition mode. If enabled, the cell detection model will not be used, and only the table structure recognition model will be used. +
      +
    • bool: True or False;
    • +
• None: If set to None, it will default to the initialized parameter value, initialized as True;
    • +
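A hedged sketch of how these table-recognition switches might be passed per call (parameter names as in this table; the input path is a placeholder):

```python
from paddleocr import PPStructureV3

pipeline = PPStructureV3()

# Per-call table recognition switches described in this table.
output = pipeline.predict(
    input="table_page.png",                 # placeholder input image
    use_table_orientation_classify=True,    # fix 90/180/270-degree rotated tables
    use_ocr_results_with_table_cells=True,  # re-run OCR per detected cell
    use_e2e_wired_table_rec_model=False,    # keep cell detection for wired tables
)
for res in output:
    res.save_to_html(save_path="output")    # export table results as HTML
```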
    +bool|None +True + @@ -2255,19 +2296,19 @@ The above Python script performs the following steps: Print result to terminal format_json bool -Whether to format output as indented JSON +Whether to format output as indented JSON. True indent int -Indentation level to beautify the JSON output. Only effective when format_json=True +Indentation level to beautify the JSON output. Only effective when format_json=True. 4 ensure_ascii bool -Whether to escape non-ASCII characters to Unicode. When True, all non-ASCII characters are escaped. When False, original characters are retained. Only effective when format_json=True +Whether to escape non-ASCII characters to Unicode. When True, all non-ASCII characters are escaped. When False, original characters are retained. Only effective when format_json=True. False @@ -2275,19 +2316,19 @@ The above Python script performs the following steps: Save result as a JSON file save_path str -Path to save the file. If a directory, the filename will be based on the input type +Path to save the file. If a directory, the filename will be based on the input type. None indent int -Indentation level for beautified JSON output. Only effective when format_json=True +Indentation level for beautified JSON output. Only effective when format_json=True. 4 ensure_ascii bool -Whether to escape non-ASCII characters to Unicode. Only effective when format_json=True +Whether to escape non-ASCII characters to Unicode. Only effective when format_json=True. False @@ -2295,7 +2336,7 @@ The above Python script performs the following steps: Save intermediate visualization results as PNG image files save_path str -Path to save the file, supports directory or file path +Path to save the file, supports directory or file path. None @@ -2303,7 +2344,7 @@ The above Python script performs the following steps: Save each page of an image or PDF file as a markdown file save_path str -Path to save the file, supports directory or file path +Path to save the file, supports directory or file path. None @@ -2311,7 +2352,7 @@ The above Python script performs the following steps: Save tables in the file as HTML format save_path str -Path to save the file, supports directory or file path +Path to save the file, supports directory or file path. None @@ -2319,7 +2360,7 @@ The above Python script performs the following steps: Save tables in the file as XLSX format save_path str -Path to save the file, supports directory or file path +Path to save the file, supports directory or file path. None @@ -2327,8 +2368,8 @@ The above Python script performs the following steps: Concatenate multiple markdown pages into a single document markdown_list list -List of markdown data for each page -Returns the merged markdown text and image list +List of markdown data for each page. +Returns the merged markdown text and image list. diff --git a/docs/version3.x/pipeline_usage/PP-StructureV3.md b/docs/version3.x/pipeline_usage/PP-StructureV3.md index 6c147ab81b2c27b91954b8c9758d3c0704e3e147..a8ad89e59edc14385e0ac26c3e57b666ac9c31f8 100644 --- a/docs/version3.x/pipeline_usage/PP-StructureV3.md +++ b/docs/version3.x/pipeline_usage/PP-StructureV3.md @@ -6,7 +6,7 @@ comments: true ## 1. 
PP-StructureV3 产线介绍 -版面解析是一种从文档图像中提取结构化信息的技术,主要用于将复杂的文档版面转换为机器可读的数据格式。这项技术在文档管理、信息提取和数据数字化等领域具有广泛的应用。版面解析通过结合光学字符识别(OCR)、图像处理和机器学习算法,能够识别和提取文档中的文本块、标题、段落、图片、表格以及其他版面元素。此过程通常包括版面分析、元素分析和数据格式化三个主要步骤,最终生成结构化的文档数据,提升数据处理的效率和准确性。PP-StructureV3 产线在通用版面解析v1产线的基础上,强化了版面区域检测、表格识别、公式识别的能力,增加了多栏阅读顺序的恢复能力、结果转换 Markdown 文件的能力,在多种文档数据中,表现优异,可以处理较复杂的文档数据。本产线同时提供了灵活的服务化部署方式,支持在多种硬件上使用多种编程语言调用。不仅如此,本产线也提供了二次开发的能力,您可以基于本产线在您自己的数据集上训练调优,训练后的模型也可以无缝集成。 +版面解析是一种从文档图像中提取结构化信息的技术,主要用于将复杂的文档版面转换为机器可读的数据格式。这项技术在文档管理、信息提取和数据数字化等领域具有广泛的应用。版面解析通过结合光学字符识别(OCR)、图像处理和机器学习算法,能够识别和提取文档中的文本块、标题、段落、图片、表格以及其他版面元素。此过程通常包括版面分析、元素分析和数据格式化三个主要步骤,最终生成结构化的文档数据,提升数据处理的效率和准确性。PP-StructureV3 产线在通用版面解析v1产线的基础上,强化了版面区域检测、表格识别、公式识别的能力,增加了图表理解能力和多栏阅读顺序的恢复能力、结果转换 Markdown 文件的能力,在多种文档数据中,表现优异,可以处理较复杂的文档数据。本产线同时提供了灵活的服务化部署方式,支持在多种硬件上使用多种编程语言调用。不仅如此,本产线也提供了二次开发的能力,您可以基于本产线在您自己的数据集上训练调优,训练后的模型也可以无缝集成。 PP-StructureV3 产线中包含以下6个模块。每个模块均可独立进行训练和推理,并包含多个模型。有关详细信息,请点击相应模块以查看文档。 @@ -16,7 +16,6 @@ comments: true - [表格识别子产线](./table_recognition_v2.md) (可选) - [印章识别子产线](./seal_recognition.md) (可选) - [公式识别子产线](./formula_recognition.md) (可选) -- [图表解析模块]() (可选) 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 @@ -1006,456 +1005,424 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu input -待预测数据,支持多种输入类型,必填。 -
      -
    • Python Var:如 numpy.ndarray 表示的图像数据
    • -
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • -
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • -
    +待预测数据,必填。 +如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + layout_detection_model_name -版面区域检测的模型名称。如果设置为None,将会使用产线默认模型。 +版面区域检测的模型名称。如果不设置,将会使用产线默认模型。 str -None + layout_detection_model_dir -版面区域检测模型的目录路径。如果设置为None,将会下载官方模型。 +版面区域检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + layout_threshold -版面模型得分阈值。 -
      -
    • float0-1 之间的任意浮点数;
    • -
    • dict{0:0.1} key为类别ID,value为该类别的阈值;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 0.5
    • -
    +版面模型得分阈值。0-1 之间的任意浮点数。如果不设置,将默认使用产线初始化的该参数值,初始化为 0.5。 -float|dict -None +float + layout_nms -版面区域检测模型是否使用NMS后处理。 +版面检测是否使用后处理NMS。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_unclip_ratio 版面区域检测模型检测框的扩张系数。 -
      -
    • float:任意大于 0 浮点数;
    • -
    • Tuple[float,float]:在横纵两个方向各自的扩张系数;
    • -
    • 字典, 字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 1.0
    • -
    +任意大于 0 浮点数。如果不设置,将默认使用产线初始化的该参数值,初始化为 1.0。 -float|Tuple[float,float]|dict -None +float + layout_merge_bboxes_mode -版面区域检测的重叠框过滤方式。 +版面检测中模型输出的检测框的合并处理模式。
      -
    • strlargesmall, union,分别表示重叠框过滤时选择保留大框,小框还是同时保留
    • -
    • dict, 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 large
    • -
    +
  • large,设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框;
  • +
• small,设置为small时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框;
  • +
  • union,不进行框的过滤处理,内外框都保留;
  • +如果不设置,将默认使用产线初始化的该参数值,初始化为large。 -str|dict -None +str + chart_recognition_model_name -图表解析的模型名称。如果设置为None,将会使用产线默认模型。 +图表解析的模型名称。如果不设置,将会使用产线默认模型。 str -None + chart_recognition_model_dir -图表解析模型的目录路径。如果设置为None,将会下载官方模型。 +图表解析模型的目录路径。如果不设置,将会下载官方模型。 str -None + chart_recognition_batch_size -图表解析模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +图表解析模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + region_detection_model_name -文档图像版面子模块检测的模型名称。如果设置为None,将会使用产线默认模型。 +文档图像版面子模块检测的模型名称。如果不设置,将会使用产线默认模型。 str -None + region_detection_model_dir -文档图像版面子模块检测模型的目录路径。如果设置为None,将会下载官方模型。 +文档图像版面子模块检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None,将会使用产线默认模型。 +文本图像矫正模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None,将会下载官方模型。 +文本图像矫正模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_detection_model_name -文本检测模型的名称。如果设置为None,将会使用产线默认模型。 +文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_detection_model_dir -文本检测模型的目录路径。如果设置为None,将会下载官方模型。 +文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_det_limit_side_len -文本检测的最大边长度限制。 -
      -
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
    • -
    +文本检测的图像边长限制。 +大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 960int -None + text_det_limit_type - -
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
    • -
    +文本检测的图像边长限制类型。支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len。如果不设置,将默认使用产线初始化的该参数值,初始化为 maxstr -None + text_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
    +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.3float -None + text_det_box_thresh -检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + text_det_unclip_ratio 文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
    +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 2.0float -None + textline_orientation_model_name -文本行方向模型的名称。如果设置为None,将会使用产线默认模型。 +文本行方向模型的名称。如果不设置,将会使用产线默认模型。 str -None + textline_orientation_model_dir -文本行方向模型的目录路径。如果设置为None,将会下载官方模型。 +文本行方向模型的目录路径。如果不设置,将会下载官方模型。 str -None + textline_orientation_batch_size -文本行方向模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +文本行方向模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + text_recognition_model_name -文本识别模型的名称。如果设置为None,将会使用产线默认模型。 +文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_recognition_model_dir -文本识别模型的目录路径。如果设置为None,将会下载官方模型。 +文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + table_classification_model_name -表格分类模型的名称。如果设置为None,将会使用产线默认模型。 +表格分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + table_classification_model_dir -表格分类模型的目录路径。如果设置为None,将会下载官方模型。 +表格分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + wired_table_structure_recognition_model_name -有线表格结构识别模型的名称。如果设置为None,将会使用产线默认模型。 +有线表格结构识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + wired_table_structure_recognition_model_dir -有线表格结构识别模型的目录路径。如果设置为None,将会下载官方模型。 +有线表格结构识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + wireless_table_structure_recognition_model_name -无线表格结构识别模型的名称。如果设置为None,将会使用产线默认模型。 +无线表格结构识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + wireless_table_structure_recognition_model_dir -无线表格结构识别模型的目录路径。如果设置为None,将会下载官方模型。 +无线表格结构识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + wired_table_cells_detection_model_name -有线表格单元格检测模型的名称。如果设置为None,将会使用产线默认模型。 +有线表格单元格检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + wired_table_cells_detection_model_dir -有线表格单元格检测模型的目录路径。如果设置为None,将会下载官方模型。 +有线表格单元格检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + wireless_table_cells_detection_model_name -无线表格单元格检测模型的名称。如果设置为None,将会使用产线默认模型。 +无线表格单元格检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + wireless_table_cells_detection_model_dir -无线表格单元格检测模型的目录路径。如果设置为None,将会下载官方模型。 +无线表格单元格检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_text_detection_model_name -印章文本检测模型的名称。如果设置为None,将会使用产线默认模型。 +印章文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + seal_text_detection_model_dir -印章文本检测模型的目录路径。如果设置为None,将会下载官方模型。 +印章文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_det_limit_side_len 印章文本检测的图像边长限制。 -
      -
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
    • -
    +大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 736int -None + seal_det_limit_type -印章文本检测的图像边长限制类型。 -
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
    • -
    +印章文本检测的图像边长限制类型。支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len。如果不设置,将默认使用产线初始化的该参数值,初始化为 minstr -None + seal_det_thresh -检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.2
    - +检测像素阈值。输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.2float -None + seal_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + seal_det_unclip_ratio 印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.5float -None + seal_text_recognition_model_name -印章文本识别模型的名称。如果设置为None,将会使用产线默认模型。 +印章文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + seal_text_recognition_model_dir -印章文本识别模型的目录路径。如果设置为None,将会下载官方模型。 +印章文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_text_recognition_batch_size -印章文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +印章文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + seal_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +大于 0 的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + formula_recognition_model_name -公式识别模型的名称。如果设置为None,将会使用产线默认模型。 +公式识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + formula_recognition_model_dir -公式识别模型的目录路径。如果设置为None,将会下载官方模型。 +公式识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + formula_recognition_batch_size -公式识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1。 +公式识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + + + +use_textline_orientation +是否加载并使用文本行方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为True。 +bool + use_seal_recognition -是否加载印章识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用印章识别子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_table_recognition -是否加载表格识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用表格识别子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_formula_recognition -是否加载公式识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用公式识别子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_chart_recognition -是否加载图表解析模型。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用图表识别子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_region_detection -是否加载文档图像版面子模块检测模型。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档区域检测子产线。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -1463,11 +1430,10 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • -
    +如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -1495,10 +1461,10 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1510,7 +1476,7 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu paddlex_config PaddleX产线配置文件路径。 str -None + @@ -1640,7 +1606,7 @@ for item in markdown_images:
    • float0-1 之间的任意浮点数;
    • dict{0:0.1} key为类别ID,value为该类别的阈值;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 0.5
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 0.5
    float|dict @@ -1648,7 +1614,7 @@ for item in markdown_images: layout_nms -版面区域检测模型是否使用NMS后处理。 +版面检测是否使用后处理NMS。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None @@ -1658,8 +1624,8 @@ for item in markdown_images:
    • float:任意大于 0 浮点数;
    • Tuple[float,float]:在横纵两个方向各自的扩张系数;
    • -
    • 字典, 字典的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)}, 表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 1.0
    • +
    • dict,dict的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)},表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍;
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 1.0
    float|Tuple[float,float]|dict @@ -1669,9 +1635,9 @@ for item in markdown_images: layout_merge_bboxes_mode 版面区域检测的重叠框过滤方式。
      -
    • strlargesmall, union,分别表示重叠框过滤时选择保留大框,小框还是同时保留
    • -
    • dict, 字典的key为int类型,代表cls_id, value为str类型, 如{0: "large", 2: "small"}, 表示对第0类别检测框使用large模式,对第2类别检测框使用small模式
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 large
    • +
    • strlargesmallunion,分别表示重叠框过滤时选择保留大框,小框还是同时保留;
    • +
    • dict: dict的key为int类型,代表cls_id,value为str类型,如{0: "large", 2: "small"},表示对第0类别检测框使用large模式,对第2类别检测框使用small模式;
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 large
    str|dict @@ -1745,10 +1711,10 @@ for item in markdown_images: text_det_limit_side_len -文本检测的最大边长度限制。 +文本检测的图像边长限制。
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 960
    int @@ -1756,10 +1722,10 @@ for item in markdown_images: text_det_limit_type - +文本检测的图像边长限制类型。
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
    • +
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 max
    str @@ -1769,8 +1735,8 @@ for item in markdown_images: text_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.3
  • float None @@ -1779,8 +1745,8 @@ for item in markdown_images: text_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.6
  • float None @@ -1789,8 +1755,8 @@ for item in markdown_images: text_det_unclip_ratio 文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 2.0
  • float None @@ -1835,8 +1801,8 @@ for item in markdown_images: text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。
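For instance, the text detection and recognition thresholds above can be set together at construction time (a sketch; the values are the documented defaults):

```python
from paddleocr import PPStructureV3

# Sketch: general OCR thresholds from this table, at their documented defaults.
pipeline = PPStructureV3(
    text_det_limit_side_len=960,  # side-length limit before detection
    text_det_limit_type="max",    # apply the limit to the longest side
    text_det_thresh=0.3,          # pixel-level probability threshold
    text_det_box_thresh=0.6,      # box-level average score threshold
    text_det_unclip_ratio=2.0,    # expansion ratio for text regions
    text_rec_score_thresh=0.0,    # 0.0 keeps every recognized line
)
```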
  • float None @@ -1918,7 +1884,7 @@ for item in markdown_images: 印章文本检测的图像边长限制。
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 736
    int @@ -1928,8 +1894,8 @@ for item in markdown_images: seal_det_limit_type 印章文本检测的图像边长限制类型。
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
    • +
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 min
    str @@ -1939,8 +1905,8 @@ for item in markdown_images: seal_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.2
    +
  • float:大于 0 的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.2
  • float None @@ -1949,8 +1915,8 @@ for item in markdown_images: seal_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.6
  • float None @@ -1959,8 +1925,8 @@ for item in markdown_images: seal_det_unclip_ratio 印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.5
  • float None @@ -1987,8 +1953,8 @@ for item in markdown_images: seal_rec_score_thresh 印章文本识别阈值,得分大于该阈值的文本结果会被保留。
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +
  • float:大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。
  • float None @@ -2013,49 +1979,55 @@ for item in markdown_images: use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +bool +None + + +use_textline_orientation +是否加载并使用文本行方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_seal_recognition -是否加载印章识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用印章识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_table_recognition -是否加载表格识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用表格识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_formula_recognition -是否加载公式识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用公式识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_chart_recognition -是否加载图表解析模型。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用图表识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_region_detection -是否加载文档图像版面子模块检测模型。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档区域检测子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -2063,7 +2035,7 @@ for item in markdown_images:
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • +
    • None:如果设置为None,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
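As an illustration, the device parameter combines naturally with the CPU-related switches listed just below (precision, enable_mkldnn, cpu_threads); a minimal CPU-only sketch:

```python
from paddleocr import PPStructureV3

# Sketch: CPU-only inference; the thread count is an illustrative value.
pipeline = PPStructureV3(
    device="cpu",
    enable_mkldnn=True,  # documented default; ignored if MKL-DNN is unavailable
    cpu_threads=8,       # number of CPU threads to use
    precision="fp32",    # documented default computation precision
)
```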
    str @@ -2091,14 +2063,14 @@ for item in markdown_images: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -2134,19 +2106,13 @@ for item in markdown_images:
    • Python Var:如 numpy.ndarray 表示的图像数据
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • -
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • +
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]。
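A short sketch of a PDF run with per-page results saved (the PDF path is a placeholder; save_to_json and save_to_markdown are described later in this section):

```python
from paddleocr import PPStructureV3

pipeline = PPStructureV3()

# Each page of the PDF yields one result object.
for res in pipeline.predict(input="input.pdf"):  # placeholder PDF path
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```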
    Python Var|str|list -device -与实例化时的参数相同。 -str -None - - use_doc_orientation_classify 是否在推理时使用文档方向分类模块。 bool @@ -2183,6 +2149,18 @@ for item in markdown_images: None +use_chart_recognition +是否加载并使用图表识别子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +bool +None + + +use_region_detection +是否加载并使用文档区域检测子产线。如果设置为None,将默认使用产线初始化的该参数值,初始化为True。 +bool +None + + layout_threshold 与实例化时的参数相同。 float|dict @@ -2278,6 +2256,65 @@ for item in markdown_images: float None + +use_wired_table_cells_trans_to_html +是否启用有线表单元格检测结果直转HTML,默认False,启用则直接基于有线表单元格检测结果的几何关系构建HTML。 +
      +
    • boolTrue 或者 False
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为False
    • +
+bool|None
+False
+
+
+use_wireless_table_cells_trans_to_html
+是否启用无线表单元格检测结果直转HTML,默认False,启用则直接基于无线表单元格检测结果的几何关系构建HTML。
+
      +
    • boolTrue 或者 False
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为False
    • +
+bool|None
+False
+
+
+use_table_orientation_classify
+是否启用表格方向分类,启用时当图像中的表格存在90/180/270度旋转时,能够将方向校正并正确完成表格识别。
+
      +
    • boolTrue 或者 False
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为True
    • +
    +bool|None +True + + +use_ocr_results_with_table_cells +是否启用单元格切分OCR,启用时会基于单元格预测结果对OCR检测结果进行切分和重识别,避免出现文字缺失情况。 +
      +
    • boolTrue 或者 False
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为True
    • +
    +bool|None +True + + +use_e2e_wired_table_rec_model +是否启用有线表端到端表格识别模式,启用则不使用单元格检测模型,只使用表格结构识别模型。 +
      +
    • boolTrue 或者 False
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为False
    • +
    +bool|None +False + + +use_e2e_wireless_table_rec_model +是否启用无线表端到端表格识别模式,启用则不使用单元格检测模型,只使用表格结构识别模型。
      +
    • boolTrue 或者 False
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为False
    • +
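A hedged sketch combining two of the switches above: building table HTML directly from cell geometry and exporting the recognized tables (the input path is a placeholder):

```python
from paddleocr import PPStructureV3

pipeline = PPStructureV3()

# Build HTML straight from wired-cell geometry, and use the end-to-end
# wireless table model, as described in this table.
output = pipeline.predict(
    input="table_page.png",                    # placeholder input image
    use_wired_table_cells_trans_to_html=True,
    use_e2e_wireless_table_rec_model=True,
)
for res in output:
    res.save_to_xlsx(save_path="output")       # export recognized tables
```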
    +bool|None +True + @@ -2299,19 +2336,19 @@ for item in markdown_images: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -2319,19 +2356,19 @@ for item in markdown_images: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -2339,15 +2376,15 @@ for item in markdown_images: 将中间各个模块的可视化图像保存在png格式的图像 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 save_to_markdown() -将图像或者PDF文件中的每一页分别保存为markdown格式的文件 +将图像或者PDF文件中的每一页分别保存为markdown格式的文件。 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -2355,7 +2392,7 @@ for item in markdown_images: 将文件中的表格保存为html格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -2363,7 +2400,7 @@ for item in markdown_images: 将文件中的表格保存为xlsx格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -2371,8 +2408,8 @@ for item in markdown_images: 将多页Markdown内容拼接为单一文档 markdown_list list -包含每一页Markdown数据的列表 -返回处理后的Markdown文本和图像列表 +包含每一页Markdown数据的列表。 +返回处理后的Markdown文本和图像列表。 @@ -2388,7 +2425,7 @@ for item in markdown_images: - `use_table_recognition`: `(bool)` 控制是否启用表格识别子产线 - `use_formula_recognition`: `(bool)` 控制是否启用公式识别子产线 - - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果字典,仅当`use_doc_preprocessor=True`时存在 + - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果dict,仅当`use_doc_preprocessor=True`时存在 - `input_path`: `(str)` 文档预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`,此处为`None` - `page_index`: `None`,此处的输入为`numpy.ndarray`,所以值为`None` - `model_settings`: `(Dict[str, bool])` 文档预处理子产线的模型配置参数 @@ -2396,7 +2433,7 @@ for item in markdown_images: - `use_doc_unwarping`: `(bool)` 控制是否启用文本图像扭曲矫正子模块 - `angle`: `(int)` 文档图像方向分类子模块的预测结果,启用时返回实际角度值 - - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。 + - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个dict,列表顺序为解析后的阅读顺序。 - `block_bbox`: `(np.ndarray)` 版面区域的边界框。 - `block_label`: `(str)` 版面区域的标签,例如`text`, `table`等。 - `block_content`: `(str)` 内容为版面区域内的内容。 @@ -2410,7 +2447,7 @@ for item in markdown_images: - - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的字典 + - `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` 全局 OCR 结果的dict - `input_path`: `(Union[str, None])` 图像OCR子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None` - `page_index`: `None`,此处的输入为`numpy.ndarray`,所以值为`None` - `model_settings`: `(Dict)` OCR子产线的模型配置参数 @@ -2431,12 +2468,12 @@ for item in markdown_images: - `rec_scores`: `(List[float])` 文本识别的置信度列表,已按`text_rec_score_thresh`过滤 - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的文本检测框列表,格式同`dt_polys` - - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个字典 + - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个dict - `rec_formula`: `(str)` 公式识别结果 - `rec_polys`: 
`(numpy.ndarray)` 公式检测框,shape为(4, 2),dtype为int16 - `formula_region_id`: `(int)` 公式所在的区域编号 - - `seal_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 印章识别结果列表,每个元素为一个字典 + - `seal_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 印章识别结果列表,每个元素为一个dict - `input_path`: `(str)` 印章图像的输入路径 - `page_index`: `None`,此处的输入为`numpy.ndarray`,所以值为`None` - `model_settings`: `(Dict)` 印章识别子产线的模型配置参数 @@ -2449,7 +2486,7 @@ for item in markdown_images: - `rec_polys`: `(List[numpy.ndarray])` 经过置信度过滤的印章检测框列表,格式同`dt_polys` - `rec_boxes`: `(numpy.ndarray)` 检测框的矩形边界框数组,shape为(n, 4),dtype为int16。每一行表示一个矩形 - - `table_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 表格识别结果列表,每个元素为一个字典 + - `table_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 表格识别结果列表,每个元素为一个dict - `cell_box_list`: `(List[numpy.ndarray])` 表格单元格的边界框列表 - `pred_html`: `(str)` 表格的HTML格式字符串 - `table_ocr_pred`: `(dict)` 表格的OCR识别结果 @@ -2493,9 +2530,9 @@ for item in markdown_images: -- `json` 属性获取的预测结果为字典类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 -- `img` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `layout_det_res`、`overall_ocr_res`、`text_paragraphs_ocr_res`、`formula_res_region1`、`table_cell_img` 和 `seal_res_region1`,对应的值是 `Image.Image` 对象:分别用于显示版面区域检测、OCR、OCR文本段落、公式、表格和印章结果的可视化图像。如果没有使用可选模块,则字典中只包含 `layout_det_res`。 -- `markdown` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `markdown_texts` 、 `markdown_images`和`page_continuation_flags`,对应的值分别是 markdown 文本,在 Markdown 中显示的图像(`Image.Image` 对象)和用于标识当前页面第一个元素是否为段开始以及最后一个元素是否为段结束的bool元组。 +- `json` 属性获取的预测结果为dict类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 +- `img` 属性返回的预测结果是一个dict类型的数据。其中,键分别为 `layout_det_res`、`overall_ocr_res`、`text_paragraphs_ocr_res`、`formula_res_region1`、`table_cell_img` 和 `seal_res_region1`,对应的值是 `Image.Image` 对象:分别用于显示版面区域检测、OCR、OCR文本段落、公式、表格和印章结果的可视化图像。如果没有使用可选模块,则dict中只包含 `layout_det_res`。 +- `markdown` 属性返回的预测结果是一个dict类型的数据。其中,键分别为 `markdown_texts` 、 `markdown_images`和`page_continuation_flags`,对应的值分别是 markdown 文本,在 Markdown 中显示的图像(`Image.Image` 对象)和用于标识当前页面第一个元素是否为段开始以及最后一个元素是否为段结束的bool元组。 @@ -3036,7 +3073,7 @@ paddleocr pp_structurev3 --paddlex_config PP-StructureV3.yaml ... 4. 在 Python API 中加载产线配置文件 -初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置字典,PaddleOCR 会读取其中的内容作为产线配置。示例如下: +初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置dict,PaddleOCR 会读取其中的内容作为产线配置。示例如下: ```python from paddleocr import PPStructureV3 diff --git a/docs/version3.x/pipeline_usage/doc_preprocessor.en.md b/docs/version3.x/pipeline_usage/doc_preprocessor.en.md index d2b4b9f5e1a7229121edf3d60eff89edff7da875..5799dceb0e36c78a2ca986ee790f7dc7e3762d61 100644 --- a/docs/version3.x/pipeline_usage/doc_preprocessor.en.md +++ b/docs/version3.x/pipeline_usage/doc_preprocessor.en.md @@ -150,73 +150,69 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu input -The data to be predicted, supporting multiple input types. This parameter is required. -
      -
    • Python Var: For example, image data represented as numpy.ndarray.
    • -
    • str: For example, the local path of an image file or PDF file: /root/data/img.jpg; or a URL link, such as the network URL of an image file or PDF file: example; or a local directory, which should contain the images to be predicted, such as the local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files need to be specified to a specific file path).
    • -
    • List: The list elements should be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"].
    • -
    +The data to be predicted. This parameter is required. +For example, the local path of an image file or PDF file: /root/data/img.jpg; or a URL link, such as the network URL of an image file or PDF file: example; or a local directory, which should contain the images to be predicted, such as the local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files need to be specified to a specific file path). -Python Var|str|list +str save_path -Specify the path to save the inference result file. If set to None, the inference result will not be saved locally. +Specify the path to save the inference result file. If not set, the inference result will not be saved locally. str -None + doc_orientation_classify_model_name -The name of the document orientation classification model. If set to None, the pipeline's default model will be used. +The name of the document orientation classification model. If not set, the pipeline's default model will be used. str -None + doc_orientation_classify_model_dir -The directory path of the document orientation classification model. If set to None, the official model will be downloaded. +The directory path of the document orientation classification model. If not set, the official model will be downloaded. str -None + doc_unwarping_model_name -The name of the text image unwarping model. If set to None, the pipeline's default model will be used. +The name of the text image unwarping model. If not set, the pipeline's default model will be used. str -None + doc_unwarping_model_dir -The directory path of the text image unwarping model. If set to None, the official model will be downloaded. +The directory path of the text image unwarping model. If not set, the official model will be downloaded. str -None + use_doc_orientation_classify -Whether to load the document orientation classification module. If set to None, the parameter value initialized by the pipeline will be used by default, initialized as True. +Whether to load and use the document orientation classification module. If not set, the parameter value initialized by the pipeline will be used by default, initialized as True. bool -None + use_doc_unwarping -Whether to load the text image unwarping module. If set to None, the parameter value initialized by the pipeline will be used by default, initialized as True. +Whether to load and use the text image unwarping module. If not set, the parameter value initialized by the pipeline will be used by default, initialized as True. bool -None + device -The device used for inference. Support for specifying specific card numbers. +The device used for inference. Support for specifying specific card numbers:
      -
    • CPU: For example, cpu indicates using the CPU for inference.
    • -
    • GPU: For example, gpu:0 indicates using the first GPU for inference.
    • -
    • NPU: For example, npu:0 indicates using the first NPU for inference.
    • -
    • XPU: For example, xpu:0 indicates using the first XPU for inference.
    • -
    • MLU: For example, mlu:0 indicates using the first MLU for inference.
    • -
    • DCU: For example, dcu:0 indicates using the first DCU for inference.
    • -
    • None: If set to None, the parameter value initialized by the pipeline will be used by default. During initialization, the local GPU 0 device will be prioritized; if not available, the CPU device will be used.
    • +
    • CPU: For example, cpu indicates using the CPU for inference;
    • +
    • GPU: For example, gpu:0 indicates using the first GPU for inference;
    • +
    • NPU: For example, npu:0 indicates using the first NPU for inference;
    • +
    • XPU: For example, xpu:0 indicates using the first XPU for inference;
    • +
    • MLU: For example, mlu:0 indicates using the first MLU for inference;
    • +
    • DCU: For example, dcu:0 indicates using the first DCU for inference;
    +If not set, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used. str -None + enable_hpi @@ -244,9 +240,9 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -258,7 +254,7 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu paddlex_config Path to PaddleX pipeline configuration file. str -None + @@ -277,7 +273,7 @@ The visualization results are saved under the `save_path`. The visualization res ### 2.2 Integration via Python Script -The command-line approach is for quick experience and viewing results. Generally, in projects, integration through code is often required. You can achieve rapid inference in production lines with just a few lines of code. The inference code is as follows: +The command-line approach is for quick experience and viewing results. Generally, in projects, integration through code is often required. You can achieve rapid inference in pipelines with just a few lines of code. The inference code is as follows: ```python from paddleocr import DocPreprocessor @@ -333,27 +329,27 @@ In the above Python script, the following steps are executed: use_doc_orientation_classify -Whether to load the document orientation classification module. If set to None, the parameter value initialized by the pipeline will be used by default, initialized as True. +Whether to load and use the document orientation classification module. If set to None, the parameter value initialized by the pipeline will be used by default, initialized as True. bool None use_doc_unwarping -Whether to load the text image unwarping module. If set to None, the parameter value initialized by the pipeline will be used by default, initialized as True. +Whether to load and use the text image unwarping module. If set to None, the parameter value initialized by the pipeline will be used by default, initialized as True. bool None device -The device used for inference. Support for specifying specific card numbers. +The device used for inference. Support for specifying specific card numbers:
      -
    • CPU: For example, cpu indicates using the CPU for inference.
    • -
    • GPU: For example, gpu:0 indicates using the first GPU for inference.
    • -
    • NPU: For example, npu:0 indicates using the first NPU for inference.
    • -
    • XPU: For example, xpu:0 indicates using the first XPU for inference.
    • -
    • MLU: For example, mlu:0 indicates using the first MLU for inference.
    • -
    • DCU: For example, dcu:0 indicates using the first DCU for inference.
    • -
    • None: If set to None, the parameter value initialized by the pipeline will be used by default. During initialization, the local GPU 0 device will be prioritized; if not available, the CPU device will be used.
    • +
    • CPU: For example, cpu indicates using the CPU for inference;
    • +
    • GPU: For example, gpu:0 indicates using the first GPU for inference;
    • +
    • NPU: For example, npu:0 indicates using the first NPU for inference;
    • +
    • XPU: For example, xpu:0 indicates using the first XPU for inference;
    • +
    • MLU: For example, mlu:0 indicates using the first MLU for inference;
    • +
    • DCU: For example, dcu:0 indicates using the first DCU for inference;
    • +
    • None: If set to None, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
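Putting the initialization parameters above together, a minimal sketch of the document preprocessing pipeline:

```python
from paddleocr import DocPreprocessor

# Sketch: both optional modules enabled, pinned to the first GPU
# (pass device="cpu" when no GPU is available).
pipeline = DocPreprocessor(
    use_doc_orientation_classify=True,
    use_doc_unwarping=True,
    device="gpu:0",
)
for res in pipeline.predict("doc_test_rotated.jpg"):  # demo image used above
    res.print()
    res.save_to_img(save_path="output")  # save the preprocessing visualization
```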
    str @@ -381,13 +377,13 @@ In the above Python script, the following steps are executed: precision The computational precision, such as fp32, fp16. str -fp32 +"fp32" enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -423,8 +419,8 @@ The following are the parameters and their descriptions of the `predict()` metho input The data to be predicted, supporting multiple input types. This parameter is required.
      -
    • Python Var: For example, image data represented as numpy.ndarray.
    • -
    • str: For example, the local path of an image file or PDF file: /root/data/img.jpg; or a URL link, such as the network URL of an image file or PDF file: example; or a local directory, which should contain the images to be predicted, such as the local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files need to be specified to a specific file path).
    • +
    • Python Var: For example, image data represented as numpy.ndarray;
    • +
    • str: For example, the local path of an image file or PDF file: /root/data/img.jpg; or a URL link, such as the network URL of an image file or PDF file: example; or a local directory, which should contain the images to be predicted, such as the local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files need to be specified to a specific file path);
    • List: The list elements should be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"].
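For example (a sketch; the file names are placeholders):

```python
from paddleocr import DocPreprocessor

pipeline = DocPreprocessor()

# A list input mixing the types above; each image yields one result carrying
# the predicted rotation angle (0/90/180/270, or -1 when classification is off).
for res in pipeline.predict(["img1.jpg", "img2.jpg"]):
    res.print()
    res.save_to_json(save_path="output")
```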
    @@ -432,12 +428,6 @@ The following are the parameters and their descriptions of the `predict()` metho -device -Same as the parameter during instantiation. -str -None - - use_doc_orientation_classify Whether to use the document orientation classification module during inference. bool @@ -521,12 +511,12 @@ The following are the parameters and their descriptions of the `predict()` metho - `page_index`: `(Union[int, None])` If the input is a PDF file, it indicates the current page number of the PDF; otherwise, it is `None` - - `model_settings`: `(Dict[str, bool])` Model parameters configured for the production line + - `model_settings`: `(Dict[str, bool])` Model parameters configured for the pipeline - `use_doc_orientation_classify`: `(bool)` Controls whether to enable the document orientation classification module - `use_doc_unwarping`: `(bool)` Controls whether to enable the text image rectification module - - `angle`: `(int)` The prediction result of the document orientation classification. When enabled, the value is one of [0, 90, 180, 270]; when disabled, it is -1 + - `angle`: `(int)` The prediction result of the document orientation classification. When enabled, the value is one of [0,90,180,270]; when disabled, it is -1 - Calling the `save_to_json()` method will save the above content to the specified `save_path`. If a directory is specified, the saved path will be `save_path/{your_img_basename}.json`. If a file is specified, it will be saved directly to that file. Since JSON files do not support saving numpy arrays, `numpy.array` types will be converted to list form. @@ -556,15 +546,15 @@ The following are the parameters and their descriptions of the `predict()` metho ## 3. Development Integration/Deployment -If the production line meets your requirements for inference speed and accuracy, you can proceed directly to development integration/deployment. +If the pipeline meets your requirements for inference speed and accuracy, you can proceed directly to development integration/deployment. -If you need to apply the production line directly to your Python project, you can refer to the example code in [2.2 Python Script Integration](#22-python脚本方式集成). +If you need to apply the pipeline directly to your Python project, you can refer to the example code in [2.2 Python Script Integration](#22-python脚本方式集成). In addition, PaddleOCR also provides two other deployment methods, which are detailed as follows: 🚀 High-performance inference: In actual production environments, many applications have strict performance requirements (especially response speed) to ensure efficient system operation and smooth user experience. To this end, PaddleOCR provides high-performance inference functionality, aiming to deeply optimize model inference and pre/post-processing to achieve significant end-to-end process acceleration. For detailed high-performance inference procedures, please refer to the [High-Performance Inference Guide](../deployment/high_performance_inference.md). -☁️ Service-oriented deployment: Service-oriented deployment is a common form of deployment in actual production environments. By encapsulating inference functions as services, clients can access these services through network requests to obtain inference results. For detailed production line service-oriented deployment procedures, please refer to the [Service-Oriented Deployment Guide](../deployment/serving.md). +☁️ Service-oriented deployment: Service-oriented deployment is a common form of deployment in actual production environments. 
By encapsulating inference functions as services, clients can access these services through network requests to obtain inference results. For detailed pipeline service-oriented deployment procedures, please refer to the [Service-Oriented Deployment Guide](../deployment/serving.md). Below are the API references for basic service-oriented deployment and examples of multi-language service calls: @@ -657,7 +647,7 @@ Below are the API references for basic service-oriented deployment and examples file string -The URL of an image file or PDF file accessible to the server, or the Base64 encoding result of the content of the above types of files. By default, for PDF files with more than 10 pages, only the first 10 pages will be processed.
    To remove the page limit, please add the following configuration to the production line configuration file: +The URL of an image file or PDF file accessible to the server, or the Base64 encoding result of the content of the above types of files. By default, for PDF files with more than 10 pages, only the first 10 pages will be processed.
    To remove the page limit, please add the following configuration to the pipeline configuration file:
    Serving:
       extra:
         max_num_input_imgs: null
    @@ -674,13 +664,13 @@ Below are the API references for basic service-oriented deployment and examples
     
     useDocOrientationClassify
     boolean | null
    -Please refer to the description of the use_doc_orientation_classify parameter in the predict method of the production line object.
    +Please refer to the description of the use_doc_orientation_classify parameter in the predict method of the pipeline object.
     No
     
     
     useDocUnwarping
     boolean | null
    -Please refer to the description of the use_doc_unwarping parameter in the predict method of the production line object.
    +Please refer to the description of the use_doc_unwarping parameter in the predict method of the pipeline object.
     No
     
     
    @@ -727,7 +717,7 @@ Below are the API references for basic service-oriented deployment and examples
     
     prunedResult
     object
    -A simplified version of the res field in the JSON representation of the result generated by the predict method of the production line object, with the input_path and page_index fields removed.
    +A simplified version of the res field in the JSON representation of the result generated by the predict method of the pipeline object, with the input_path and page_index fields removed.
     
     
     docPreprocessingImage
    diff --git a/docs/version3.x/pipeline_usage/doc_preprocessor.md b/docs/version3.x/pipeline_usage/doc_preprocessor.md
    index 5a3c5b03093afdcf1f7e31556a410907d29c1d95..26f802e311b2fddc208231052490592d7a6ad1fa 100644
    --- a/docs/version3.x/pipeline_usage/doc_preprocessor.md
    +++ b/docs/version3.x/pipeline_usage/doc_preprocessor.md
    @@ -152,63 +152,59 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu
     
     
     input
    -待预测数据,支持多种输入类型,必填。
    -
      -
    • Python Var:如 numpy.ndarray 表示的图像数据
    • -
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • -
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • -
    +待预测数据,必填。 +如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果不设置,将会下载官方模型。 str -None + use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -216,11 +212,10 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • -
    +如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -248,10 +243,10 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -263,7 +258,7 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu paddlex_config PaddleX产线配置文件路径。 str -None + @@ -315,45 +310,45 @@ for res in output: doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果设置为None,将会下载官方模型。 str None use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -361,7 +356,7 @@ for res in output:
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
    str @@ -389,14 +384,14 @@ for res in output: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -432,21 +427,15 @@ for res in output: input 待预测数据,支持多种输入类型,必填。
- • Python Var:如 numpy.ndarray 表示的图像数据
- • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
- • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]、["/root/data/img1.jpg", "/root/data/img2.jpg"]、["/root/data1", "/root/data2"]
+ • Python Var:如 numpy.ndarray 表示的图像数据;
+ • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径);
+ • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]、["/root/data/img1.jpg", "/root/data/img2.jpg"]、["/root/data1", "/root/data2"]。
    Python Var|str|list -device -与实例化时的参数相同。 -str -None - - use_doc_orientation_classify 是否在推理时使用文档方向分类模块。 bool @@ -560,7 +549,7 @@ for res in output: - `json` 属性获取的预测结果为dict类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 -- `img` 属性返回的预测结果是一个字典类型的数据。其中,键为 `preprocessed_img`,对应的值是 `Image.Image` 对象:用于显示 doc_preprocessor 结果的可视化图像。 +- `img` 属性返回的预测结果是一个dict类型的数据。其中,键为 `preprocessed_img`,对应的值是 `Image.Image` 对象:用于显示 doc_preprocessor 结果的可视化图像。 ## 3. 开发集成/部署 diff --git a/docs/version3.x/pipeline_usage/doc_understanding.en.md b/docs/version3.x/pipeline_usage/doc_understanding.en.md index 4441b16e1b08b3024737355dddc617d290146dbf..d42902525dd45e05a4af7c67c5ea77ebbbf95346 100644 --- a/docs/version3.x/pipeline_usage/doc_understanding.en.md +++ b/docs/version3.x/pipeline_usage/doc_understanding.en.md @@ -1,7 +1,5 @@ --- - comments: true - --- # Document Understanding Pipeline Usage Tutorial @@ -62,7 +60,7 @@ Before using the document understanding pipeline locally, ensure that you have c Experience the doc_understanding pipeline with just one command line: ```bash -paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容, 以markdown格式输出'}" +paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容,以markdown格式输出'}" ```
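For a quick sanity check from Python rather than the shell, the same request can be issued through the `DocUnderstanding` class that this document introduces in section 2.2. A minimal sketch, using the same image URL and query as the command above:

```python
from paddleocr import DocUnderstanding

# The dict passed to predict() mirrors the dict-like string given to `-i` above:
# an image (URL or local path) plus the user's question about it.
pipeline = DocUnderstanding()
output = pipeline.predict(
    {
        "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png",
        "query": "识别这份表格的内容,以markdown格式输出",
    }
)
for res in output:
    res.print()  # prints the structured result, including the model's markdown answer
```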
    The command line supports more parameter settings, click to expand for a detailed explanation of the command line parameters @@ -78,41 +76,39 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo input -Data to be predicted, supports dictionary type input, required. -
- • Python Dict: The input format for PP-DocBee is: {"image":/path/to/image, "query": user question}, representing the input image and corresponding user question.
    +Data to be predicted, required. +"{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': 'Recognize the content of this table and output it in markdown format'}". -Python Var|str|list +str save_path -Specify the path for saving the inference result file. If set to None, the inference result will not be saved locally. +Specify the path for saving the inference result file. If not set, the inference result will not be saved locally. str -None + doc_understanding_model_name -The name of the document understanding model. If set to None, the default model of the pipeline will be used. +The name of the document understanding model. If not set, the default model of the pipeline will be used. str -None + doc_understanding_model_dir -The directory path of the document understanding model. If set to None, the official model will be downloaded. +The directory path of the document understanding model. If not set, the official model will be downloaded. str -None + doc_understanding_batch_size -The batch size of the document understanding model. If set to None, the default batch size will be set to 1. +The batch size of the document understanding model. If not set, the default batch size will be set to 1. int -None + device -The device used for inference. Supports specifying a specific card number. +The device used for inference. Supports specifying a specific card number:
    • CPU: For example, cpu indicates using the CPU for inference;
    • GPU: For example, gpu:0 indicates using the first GPU for inference;
@@ -120,11 +116,10 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo
    • XPU: For example, xpu:0 indicates using the first XPU for inference;
    • MLU: For example, mlu:0 indicates using the first MLU for inference;
    • DCU: For example, dcu:0 indicates using the first DCU for inference;
- • None: If set to None, the initialized value of this parameter will be used by default, which will preferentially use the local GPU device 0, or the CPU device if none is available.
    +If not set, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used. str -None + enable_hpi @@ -152,9 +147,9 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -166,7 +161,7 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo paddlex_config Path to PaddleX pipeline configuration file. str -None + @@ -176,7 +171,7 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo The results will be printed to the terminal, and the default configuration of the doc_understanding pipeline will produce the following output: ```bash -{'res': {'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容, 以markdown格式输出', 'result': '| 名次 | 国家/地区 | 金牌 | 银牌 | 铜牌 | 奖牌总数 |\n| --- | --- | --- | --- | --- | --- |\n| 1 | 中国(CHN) | 48 | 22 | 30 | 100 |\n| 2 | 美国(USA) | 36 | 39 | 37 | 112 |\n| 3 | 俄罗斯(RUS) | 24 | 13 | 23 | 60 |\n| 4 | 英国(GBR) | 19 | 13 | 19 | 51 |\n| 5 | 德国(GER) | 16 | 11 | 14 | 41 |\n| 6 | 澳大利亚(AUS) | 14 | 15 | 17 | 46 |\n| 7 | 韩国(KOR) | 13 | 11 | 8 | 32 |\n| 8 | 日本(JPN) | 9 | 8 | 8 | 25 |\n| 9 | 意大利(ITA) | 8 | 9 | 10 | 27 |\n| 10 | 法国(FRA) | 7 | 16 | 20 | 43 |\n| 11 | 荷兰(NED) | 7 | 5 | 4 | 16 |\n| 12 | 乌克兰(UKR) | 7 | 4 | 11 | 22 |\n| 13 | 肯尼亚(KEN) | 6 | 4 | 6 | 16 |\n| 14 | 西班牙(ESP) | 5 | 11 | 3 | 19 |\n| 15 | 牙买加(JAM) | 5 | 4 | 2 | 11 |\n'}} +{'res': {'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容,以markdown格式输出', 'result': '| 名次 | 国家/地区 | 金牌 | 银牌 | 铜牌 | 奖牌总数 |\n| --- | --- | --- | --- | --- | --- |\n| 1 | 中国(CHN) | 48 | 22 | 30 | 100 |\n| 2 | 美国(USA) | 36 | 39 | 37 | 112 |\n| 3 | 俄罗斯(RUS) | 24 | 13 | 23 | 60 |\n| 4 | 英国(GBR) | 19 | 13 | 19 | 51 |\n| 5 | 德国(GER) | 16 | 11 | 14 | 41 |\n| 6 | 澳大利亚(AUS) | 14 | 15 | 17 | 46 |\n| 7 | 韩国(KOR) | 13 | 11 | 8 | 32 |\n| 8 | 日本(JPN) | 9 | 8 | 8 | 25 |\n| 9 | 意大利(ITA) | 8 | 9 | 10 | 27 |\n| 10 | 法国(FRA) | 7 | 16 | 20 | 43 |\n| 11 | 荷兰(NED) | 7 | 5 | 4 | 16 |\n| 12 | 乌克兰(UKR) | 7 | 4 | 11 | 22 |\n| 13 | 肯尼亚(KEN) | 6 | 4 | 6 | 16 |\n| 14 | 西班牙(ESP) | 5 | 11 | 3 | 19 |\n| 15 | 牙买加(JAM) | 5 | 4 | 2 | 11 |\n'}} ``` ### 2.2 Python Script Integration @@ -190,7 +185,7 @@ pipeline = DocUnderstanding() output = pipeline.predict( { "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png", - "query": "识别这份表格的内容, 以markdown格式输出" + "query": "识别这份表格的内容,以markdown格式输出" } ) for res in output: @@ -232,7 +227,7 @@ In the above Python script, the following steps are performed: device -The device used for inference. Supports specifying a specific card number. +The device used for inference. Supports specifying a specific card number:
    • CPU: For example, cpu indicates using the CPU for inference;
    • GPU: For example, gpu:0 indicates using the first GPU for inference;
@@ -240,7 +235,7 @@ In the above Python script, the following steps are performed:
    • XPU: For example, xpu:0 indicates using the first XPU for inference;
    • MLU: For example, mlu:0 indicates using the first MLU for inference;
    • DCU: For example, dcu:0 indicates using the first DCU for inference;
- • None: If set to None, the initialized value of this parameter will be used by default, which will preferentially use the local GPU device 0, or the CPU device if none is available.
+ • None: If set to None, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
    str @@ -268,13 +263,13 @@ In the above Python script, the following steps are performed: precision Calculation precision, such as fp32, fp16. str -fp32 +"fp32" enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -316,11 +311,7 @@ Below are the parameters and their descriptions for the `predict()` method: Python Dict - -device -Same as the parameter during instantiation. -str -None + (3) Process the prediction results. The prediction result for each sample is a corresponding Result object, which supports printing and saving as a `json` file: @@ -341,19 +332,19 @@ Below are the parameters and their descriptions for the `predict()` method: Print the result to the terminal format_json bool -Whether to format the output content using JSON indentation +Whether to format the output content using JSON indentation. True indent int -Specifies the indentation level to beautify the output JSON data, making it more readable, effective only when format_json is True +Specifies the indentation level to beautify the output JSON data, making it more readable, effective only when format_json is True. 4 ensure_ascii bool -Controls whether to escape non-ASCII characters into Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters, effective only when format_json is True +Controls whether to escape non-ASCII characters into Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters, effective only when format_json is True. False @@ -367,13 +358,13 @@ Below are the parameters and their descriptions for the `predict()` method: indent int -Specifies the indentation level to beautify the output JSON data, making it more readable, effective only when format_json is True +Specifies the indentation level to beautify the output JSON data, making it more readable, effective only when format_json is True. 4 ensure_ascii bool -Controls whether to escape non-ASCII characters into Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters, effective only when format_json is True +Controls whether to escape non-ASCII characters into Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters, effective only when format_json is True. False diff --git a/docs/version3.x/pipeline_usage/doc_understanding.md b/docs/version3.x/pipeline_usage/doc_understanding.md index f58a3998f3a3b00f0010b3a180a6db90fe9cb878..39ae0b9ef841ce89383cbb600e0ddc88df9de8d5 100644 --- a/docs/version3.x/pipeline_usage/doc_understanding.md +++ b/docs/version3.x/pipeline_usage/doc_understanding.md @@ -45,7 +45,7 @@ comments: true -注:以上模型总分为内部评估集模型测试结果,内部评估集所有图像分辨率 (height, width) 为 (1680,1204),共1196条数据,包括了财报、法律法规、理工科论文、说明书、文科论文、合同、研报等场景,暂时未有计划公开。 +注:以上模型总分为内部评估集模型测试结果,内部评估集所有图像分辨率 (height,width) 为 (1680,1204),共1196条数据,包括了财报、法律法规、理工科论文、说明书、文科论文、合同、研报等场景,暂时未有计划公开。

    @@ -60,7 +60,7 @@ comments: true 一行命令即可快速体验 doc_understanding 产线效果: ```bash -paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容, 以markdown格式输出'}" +paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容,以markdown格式输出'}" ```
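The run result printed further below is a dict whose result field carries the model's markdown answer. As a small illustration of post-processing that answer, a sketch that writes it to a file; it assumes the returned Result object supports dict-style access by key (the key names match the printed output documented below, and the output filename is only illustrative):

```python
from pathlib import Path

from paddleocr import DocUnderstanding

pipeline = DocUnderstanding()
output = pipeline.predict(
    {
        "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png",
        "query": "识别这份表格的内容,以markdown格式输出",
    }
)

out_dir = Path("output")
out_dir.mkdir(parents=True, exist_ok=True)
for res in output:
    res.save_to_json(save_path="output")  # documented way to persist the full result
    answer = res["result"]                # assumption: dict-style access to the answer text
    (out_dir / "medal_table.md").write_text(answer, encoding="utf-8")
```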
    命令行支持更多参数设置,点击展开以查看命令行参数的详细说明 @@ -76,43 +76,38 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo input -待预测数据,支持多种输入类型,必填。 -
- • Python Var:如 numpy.ndarray 表示的图像数据
- • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
- • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]、["/root/data/img1.jpg", "/root/data/img2.jpg"]、["/root/data1", "/root/data2"]
    +待预测数据,必填。如"{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容,以markdown格式输出'}"。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + doc_understanding_model_name -文档理解模型的名称。如果设置为None, 将会使用产线默认模型。 +文档理解模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_understanding_model_dir -文档理解模型的目录路径。如果设置为None, 将会下载官方模型。 +文档理解模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_understanding_batch_size -文档理解模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +文档理解模型的批处理大小。如果设置为None,将默认设置批处理大小为1int -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -120,11 +115,10 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
- • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    +如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -152,10 +146,10 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -167,7 +161,7 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo paddlex_config PaddleX产线配置文件路径。 str -None + @@ -177,7 +171,7 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo 运行结果会被打印到终端上,默认配置的 doc_understanding 产线的运行结果如下: ```bash -{'res': {'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容, 以markdown格式输出', 'result': '| 名次 | 国家/地区 | 金牌 | 银牌 | 铜牌 | 奖牌总数 |\n| --- | --- | --- | --- | --- | --- |\n| 1 | 中国(CHN) | 48 | 22 | 30 | 100 |\n| 2 | 美国(USA) | 36 | 39 | 37 | 112 |\n| 3 | 俄罗斯(RUS) | 24 | 13 | 23 | 60 |\n| 4 | 英国(GBR) | 19 | 13 | 19 | 51 |\n| 5 | 德国(GER) | 16 | 11 | 14 | 41 |\n| 6 | 澳大利亚(AUS) | 14 | 15 | 17 | 46 |\n| 7 | 韩国(KOR) | 13 | 11 | 8 | 32 |\n| 8 | 日本(JPN) | 9 | 8 | 8 | 25 |\n| 9 | 意大利(ITA) | 8 | 9 | 10 | 27 |\n| 10 | 法国(FRA) | 7 | 16 | 20 | 43 |\n| 11 | 荷兰(NED) | 7 | 5 | 4 | 16 |\n| 12 | 乌克兰(UKR) | 7 | 4 | 11 | 22 |\n| 13 | 肯尼亚(KEN) | 6 | 4 | 6 | 16 |\n| 14 | 西班牙(ESP) | 5 | 11 | 3 | 19 |\n| 15 | 牙买加(JAM) | 5 | 4 | 2 | 11 |\n'}} +{'res': {'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容,以markdown格式输出', 'result': '| 名次 | 国家/地区 | 金牌 | 银牌 | 铜牌 | 奖牌总数 |\n| --- | --- | --- | --- | --- | --- |\n| 1 | 中国(CHN) | 48 | 22 | 30 | 100 |\n| 2 | 美国(USA) | 36 | 39 | 37 | 112 |\n| 3 | 俄罗斯(RUS) | 24 | 13 | 23 | 60 |\n| 4 | 英国(GBR) | 19 | 13 | 19 | 51 |\n| 5 | 德国(GER) | 16 | 11 | 14 | 41 |\n| 6 | 澳大利亚(AUS) | 14 | 15 | 17 | 46 |\n| 7 | 韩国(KOR) | 13 | 11 | 8 | 32 |\n| 8 | 日本(JPN) | 9 | 8 | 8 | 25 |\n| 9 | 意大利(ITA) | 8 | 9 | 10 | 27 |\n| 10 | 法国(FRA) | 7 | 16 | 20 | 43 |\n| 11 | 荷兰(NED) | 7 | 5 | 4 | 16 |\n| 12 | 乌克兰(UKR) | 7 | 4 | 11 | 22 |\n| 13 | 肯尼亚(KEN) | 6 | 4 | 6 | 16 |\n| 14 | 西班牙(ESP) | 5 | 11 | 3 | 19 |\n| 15 | 牙买加(JAM) | 5 | 4 | 2 | 11 |\n'}} ``` ### 2.2 Python脚本方式集成 @@ -191,7 +185,7 @@ pipeline = DocUnderstanding() output = pipeline.predict( { "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png", - "query": "识别这份表格的内容, 以markdown格式输出" + "query": "识别这份表格的内容,以markdown格式输出" } ) for res in output: @@ -201,7 +195,7 @@ for res in output: 在上述 Python 脚本中,执行了如下几个步骤: -(1)通过 `DocUnderstanding()` 实例化 文档理解产线 产线对象,具体参数说明如下: +(1)通过 `DocUnderstanding()` 实例化文档理解产线产线对象,具体参数说明如下: @@ -215,25 +209,25 @@ for res in output: - + - + - + - @@ -255,7 +249,7 @@ for res in output: - + @@ -269,14 +263,14 @@ for res in output: - + - - + @@ -310,19 +304,14 @@ for res in output: - - - - - -
    doc_understanding_model_name文档理解模型的名称。如果设置为None, 将会使用产线默认模型。文档理解模型的名称。如果设置为None,将会使用产线默认模型。 str None
    doc_understanding_model_dir文档理解模型的目录路径。如果设置为None, 将会下载官方模型。文档理解模型的目录路径。如果设置为None,将会下载官方模型。 str None
    doc_understanding_batch_size文档理解模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1文档理解模型的批处理大小。如果设置为None,将默认设置批处理大小为1 int None
    device用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -241,7 +235,7 @@ for res in output:
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
- • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
+ • None:如果设置为None,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
    str
    use_tensorrt是否使用 TensorRT 进行推理加速。是否使用TensorRT进行推理加速。 bool False
    precision 计算精度,如 fp32、fp16。 strfp32"fp32"
    enable_mkldnn是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 boolNoneTrue
    cpu_threads
    input待预测数据,目前仅支持字典类型的输入 +待预测数据,目前仅支持dict类型的输入
- • Python Dict:如PP-DocBee的输入形式为: {"image":/path/to/image, "query": user question} ,分别表示输入的图像和对应的用户问题
+ • Python Dict:如PP-DocBee的输入形式为: {"image":/path/to/image, "query": user question} ,分别表示输入的图像和对应的用户问题。
    Python Dict
    device与实例化时的参数相同。strNone
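Note that with the device row above removed from the predict() parameters in this revision, the inference device is now chosen once, when the pipeline object is constructed (see the device row in the instantiation table earlier). A minimal sketch, assuming a machine where GPU 0 is available:

```python
from paddleocr import DocUnderstanding

# Device selection happens at construction time, not per predict() call;
# use "cpu" instead of "gpu:0" on machines without a GPU.
pipeline = DocUnderstanding(device="gpu:0")

output = pipeline.predict(
    {
        "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png",
        "query": "识别这份表格的内容,以markdown格式输出",
    }
)
for res in output:
    res.print()
```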
    (3)对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为`json`文件的操作: @@ -343,19 +332,19 @@ for res in output: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -363,19 +352,19 @@ for res in output: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -415,7 +404,7 @@ for res in output: 如果产线可以达到您对产线推理速度和精度的要求,您可以直接进行开发集成/部署。 -若您需要将产线直接应用在您的Python项目中,可以参考 [2.2 Python脚本方式](#22-python脚本方式集成)中的示例代码。 +若您需要将产线直接应用在您的Python项目中,可以参考 [2.2 Python脚本方式](#22-python脚本方式集成) 中的示例代码。 此外,PaddleOCR 也提供了其他两种部署方式,详细说明如下: diff --git a/docs/version3.x/pipeline_usage/formula_recognition.en.md b/docs/version3.x/pipeline_usage/formula_recognition.en.md index fe5f20ba4681fda9b5bb9f4c537ab5529ae66526..d91f5f3f6e096bd727722f1d39410db0a9c65231 100644 --- a/docs/version3.x/pipeline_usage/formula_recognition.en.md +++ b/docs/version3.x/pipeline_usage/formula_recognition.en.md @@ -2,7 +2,7 @@ comments: true --- -# Formula Recognition Pipeline Tutorial +# Formula Recognition Pipeline Usage Tutorial ## 1. Introduction to Formula Recognition Pipeline @@ -255,7 +255,7 @@ In this pipeline, you can choose the model you want to use based on the benchmar
    -Formula Recognition Module : +Formula Recognition Module : @@ -331,7 +331,7 @@ In this pipeline, you can choose the model you want to use based on the benchmar
    • Performance Test Environment
        -
      • Test Dataset: +
      • Test Dataset:
        • Document Image Orientation Classification Module: A self-built dataset using PaddleOCR, covering multiple scenarios such as ID cards and documents, containing 1000 images.
        • @@ -341,7 +341,7 @@ In this pipeline, you can choose the model you want to use based on the benchmar
        • Formula Recognition Module: A self-built formula recognition test set using PaddleX.
      • -
      • Hardware Configuration: +
      • Hardware Configuration:
        • GPU: NVIDIA Tesla T4
        • CPU: Intel Xeon Gold 6271C @ 2.60GHz
        • @@ -388,7 +388,7 @@ Before using the formula recognition pipeline locally, please ensure that you ha ### 2.1 Command Line Experience -You can quickly experience the effect of the formula recognition pipeline with one command: +You can quickly experience the effect of the formula recognition pipeline with one command. Before running the code below, please download the [example image](https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/pipelines/general_formula_recognition_001.png) locally: ```bash paddleocr formula_recognition_pipeline -i https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/pipelines/general_formula_recognition_001.png @@ -416,172 +416,156 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png
    - - + +Specify the path to save the inference results file. If not set, the inference results will not be saved locally. - + +The name of the document orientation classification model. If not set, the default model in pipeline will be used. - + - + - + - - + - + - + - - + - + - + - + - + +Whether to load and use the text image unwarping module. If not set, the parameter will default to the value initialized in the pipeline, which is True. - + +The name of the layout detection model. If not set, the default model in pipeline will be used. - + - - + - - - + + - + - - - + + - + - + - + +Whether to load and use the layout detection module. If not set, the parameter will default to the value initialized in the pipeline, which is True. - + - + - - + - + - + - - + @@ -621,10 +604,10 @@ The name of the formula recognition model. If set to None, the defa - - + @@ -637,7 +620,7 @@ The number of threads to use when performing inference on the CPU. - +
    ModelModel Download Link
    inputData to be predicted, supporting multiple input types, required. -
      -
    • Python Var: Image data represented by numpy.ndarray
    • -
    • str: Local path of image or PDF file, e.g., /root/data/img.jpg; URL link, e.g., network URL of image or PDF file: Example; Local directory, the directory should contain images to be predicted, e.g., local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files must be specified with a specific file path)
    • -
    • List: Elements of the list must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], [\"/root/data/img1.jpg\", \"/root/data/img2.jpg\"], [\"/root/data1\", \"/root/data2\"]
    • -
    +
    Data to be predicted, required. +Local path of image or PDF file, e.g., /root/data/img.jpg; URL link, e.g., network URL of image or PDF file: Example; Local directory, the directory should contain images to be predicted, e.g., local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files must be specified with a specific file path). Python Var|str|liststr
    save_path -Specify the path to save the inference results file. If set to None, the inference results will not be saved locally.strNone
    doc_orientation_classify_model_name -The name of the document orientation classification model. If set to None, the default model in pipeline will be used.strNone
    doc_orientation_classify_model_dirThe directory path of the document orientation classification model. If set to None, the official model will be downloaded.The directory path of the document orientation classification model. If not set, the official model will be downloaded. strNone
    doc_orientation_classify_batch_sizeThe batch size of the document orientation classification model. If set to None, the default batch size will be set to 1. +The batch size of the document orientation classification model. If not set, the default batch size will be set to 1. intNone
    doc_unwarping_model_name The name of the text image unwarping model. If set to None, the default model in pipeline will be used. The name of the text image unwarping model. If not set, the default model in pipeline will be used. strNone
    doc_unwarping_model_dir The directory path of the text image unwarping model. If set to None, the official model will be downloaded. + The directory path of the text image unwarping model. If not set, the official model will be downloaded. strNone
    doc_unwarping_batch_sizeThe batch size of the text image unwarping model. If set to None, the default batch size will be set to 1.The batch size of the text image unwarping model. If not set, the default batch size will be set to 1. intNone
    use_doc_orientation_classifyWhether to load the document orientation classification module. If set to None, the parameter will default to the value initialized in the pipeline, which is True.Whether to load and use the document orientation classification module. If not set, the parameter will default to the value initialized in the pipeline, which is True. boolNone
    use_doc_unwarping -Whether to load the text image unwarping module. If set to None, the parameter will default to the value initialized in the pipeline, which is True.boolNone
    layout_detection_model_name -The name of the layout detection model. If set to None, the default model in pipeline will be used. strNone
    layout_detection_model_dir The directory path of the layout detection model. If set to None, the official model will be downloaded. + The directory path of the layout detection model. If not set, the official model will be downloaded. strNone
    layout_thresholdThreshold for layout detection, used to filter out predictions with low confidence. -
- • float, such as 0.2, indicates filtering out all bounding boxes with a confidence score less than 0.2.
- • Dictionary, with int keys representing cls_id and float values as thresholds. For example, {0: 0.45, 2: 0.48, 7: 0.4} indicates applying a threshold of 0.45 for class ID 0, 0.48 for class ID 2, and 0.4 for class ID 7
- • None, If not specified, the default PaddleX official model configuration will be used
    +
    Score threshold for the layout model. Any value between 0-1. If not set, the default value is used, which is 0.5. float|dictNonefloat
    layout_nms -Whether to use NMS (Non-Maximum Suppression) post-processing for layout region detection to filter out overlapping boxes. If set to None, the default configuration of the official model will be used. +Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If not set, the parameter will default to the value initialized in the pipeline, which is set to True by default. boolNone
    layout_unclip_ratio -The scaling factor for the side length of the detection boxes in layout region detection. -
- • float: A positive float number, e.g., 1.1, indicating that the center of the bounding box remains unchanged while the width and height are both scaled up by a factor of 1.1
- • List: e.g., [1.2, 1.5], indicating that the center of the bounding box remains unchanged while the width is scaled up by a factor of 1.2 and the height by a factor of 1.5
- • None: If not specified, the default PaddleX official model configuration will be used
    +
    Unclip ratio for detected boxes in layout detection model. Any float > 0. If not set, the default is 1.0. float|listNonefloat
    layout_merge_bboxes_mode The merging mode for the detection boxes output by the model in layout region detection.
- • large: When set to "large", only the largest outer bounding box will be retained for overlapping bounding boxes, and the inner overlapping boxes will be removed.
- • small: When set to "small", only the smallest inner bounding boxes will be retained for overlapping bounding boxes, and the outer overlapping boxes will be removed.
- • union: No filtering of bounding boxes will be performed, and both inner and outer boxes will be retained.
- • None: If not specified, the default PaddleX official model configuration will be used
+ • large: When set to "large", only the largest outer bounding box will be retained for overlapping bounding boxes, and the inner overlapping boxes will be removed;
+ • small: When set to "small", only the smallest inner bounding boxes will be retained for overlapping bounding boxes, and the outer overlapping boxes will be removed;
+ • union: No filtering of bounding boxes will be performed, and both inner and outer boxes will be retained;
+ If not set, the default is large.
    strNone
    layout_detection_batch_sizeThe batch size for the layout region detection model. If set to None, the default batch size will be set to 1.The batch size for the layout region detection model. If not set, the default batch size will be set to 1. intNone
    use_layout_detection -Whether to load the layout detection module. If set to None, the parameter will default to the value initialized in the pipeline, which is True.boolNone
    formula_recognition_model_name -The name of the formula recognition model. If set to None, the default model from the pipeline will be used. +The name of the formula recognition model. If not set, the default model from the pipeline will be used. strNone
    formula_recognition_model_dirThe directory path of the formula recognition model. If set to None, the official model will be downloaded. +The directory path of the formula recognition model. If not set, the official model will be downloaded. strNone
    formula_recognition_batch_sizeThe batch size for the formula recognition model. If set to None, the batch size will default to 1.The batch size for the formula recognition model. If not set, the batch size will default to 1. intNone
    deviceThe device used for inference. You can specify a particular card number. +The device used for inference. You can specify a particular card number:
    • CPU: e.g., cpu indicates using CPU for inference;
    • GPU: e.g., gpu:0 indicates using the 1st GPU for inference;
@@ -589,11 +573,10 @@ The name of the formula recognition model. If set to None, the defa
    • XPU: e.g., xpu:0 indicates using the 1st XPU for inference;
    • MLU: e.g., mlu:0 indicates using the 1st MLU for inference;
    • DCU: e.g., dcu:0 indicates using the 1st DCU for inference;
- • None: If set to None, the default value initialized by the pipeline will be used. During initialization, the local GPU 0 will be prioritized; if unavailable, the CPU will be used.
    +If not set, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
    strNone
    enable_hpi
    enable_mkldnnWhether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. boolNoneTrue
    cpu_threadspaddlex_config Path to PaddleX pipeline configuration file. strNone
    @@ -736,13 +719,13 @@ In the above Python script, the following steps are executed: use_doc_orientation_classify -Whether to load the document orientation classification module. If set to None, the parameter will default to the value initialized in the pipeline, which is True. +Whether to load and use the document orientation classification module. If set to None, the parameter will default to the value initialized in the pipeline, which is True. bool None use_doc_unwarping -Whether to load the text image unwarping module. If set to None, the parameter will default to the value initialized in the pipeline, which is True. +Whether to load and use the text image unwarping module. If set to None, the parameter will default to the value initialized in the pipeline, which is True. bool None @@ -762,9 +745,9 @@ In the above Python script, the following steps are executed: layout_threshold Threshold for layout detection, used to filter out predictions with low confidence.
- • float, such as 0.2, indicates filtering out all bounding boxes with a confidence score less than 0.2.
- • Dictionary, with int keys representing cls_id and float values as thresholds. For example, {0: 0.45, 2: 0.48, 7: 0.4} indicates applying a threshold of 0.45 for class ID 0, 0.48 for class ID 2, and 0.4 for class ID 7
- • None, If not specified, the default PaddleX official model configuration will be used
+ • float: Such as 0.2, indicates filtering out all bounding boxes with a confidence score less than 0.2;
+ • Dictionary: With int keys representing cls_id and float values as thresholds. For example, {0: 0.45, 2: 0.48, 7: 0.4} indicates applying a threshold of 0.45 for class ID 0, 0.48 for class ID 2, and 0.4 for class ID 7;
+ • None: If set to None, the default is 0.5.
    float|dict @@ -772,33 +755,33 @@ In the above Python script, the following steps are executed: layout_nms -Whether to use NMS (Non-Maximum Suppression) post-processing for layout region detection to filter out overlapping boxes. If set to None, the default configuration of the official model will be used. +Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If set to None, the parameter will default to the value initialized in the pipeline, which is set to True by default. bool None layout_unclip_ratio -The scaling factor for the side length of the detection boxes in layout region detection. +Expansion factor for the detection boxes of the layout region detection model.
- • float: A positive float number, e.g., 1.1, indicating that the center of the bounding box remains unchanged while the width and height are both scaled up by a factor of 1.1
- • List: e.g., [1.2, 1.5], indicating that the center of the bounding box remains unchanged while the width is scaled up by a factor of 1.2 and the height by a factor of 1.5
- • None: If not specified, the default PaddleX official model configuration will be used
+ • float: Any float greater than 0;
+ • Tuple[float,float]: Expansion ratios in horizontal and vertical directions;
+ • dict: A dictionary with int keys representing cls_id, and tuple values, e.g., {0: (1.1, 2.0)} means width is expanded 1.1× and height 2.0× for class 0 boxes;
+ • None: If set to None, uses the pipeline default of 1.0.
    -float|list +float|Tuple[float,float]|dict None layout_merge_bboxes_mode -The merging mode for the detection boxes output by the model in layout region detection. +Filtering method for overlapping boxes in layout detection.
- • large: When set to "large", only the largest outer bounding box will be retained for overlapping bounding boxes, and the inner overlapping boxes will be removed.
- • small: When set to "small", only the smallest inner bounding boxes will be retained for overlapping bounding boxes, and the outer overlapping boxes will be removed.
- • union: No filtering of bounding boxes will be performed, and both inner and outer boxes will be retained.
- • None: If not specified, the default PaddleX official model configuration will be used
+ • str: Options include large, small, and union to retain the larger box, smaller box, or both;
+ • dict: A dictionary with int keys representing cls_id, and str values, e.g., {0: "large", 2: "small"} means using different modes for different classes;
+ • None: If set to None, uses the pipeline default value large.
    -str +str|dict None @@ -809,7 +792,7 @@ In the above Python script, the following steps are executed: use_layout_detection -Whether to load the layout detection module. If set to None, the parameter will default to the value initialized in the pipeline, which is True. +Whether to load and use the layout detection module. If set to None, the parameter will default to the value initialized in the pipeline, which is True. bool None @@ -833,7 +816,7 @@ In the above Python script, the following steps are executed: device -The device used for inference. You can specify a particular card number. +The device used for inference. You can specify a particular card number:
    • CPU: e.g., cpu indicates using CPU for inference;
    • GPU: e.g., gpu:0 indicates using the 1st GPU for inference;
@@ -841,8 +824,8 @@ In the above Python script, the following steps are executed:
    • XPU: e.g., xpu:0 indicates using the 1st XPU for inference;
    • MLU: e.g., mlu:0 indicates using the 1st MLU for inference;
    • DCU: e.g., dcu:0 indicates using the 1st DCU for inference;
    • -
    • None: If set to None, the default value initialized by the pipeline will be used. During initialization, the local GPU 0 will be prioritized; if unavailable, the CPU will be used.
    • -
    +
  • None: If set to None, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used. + str None @@ -869,14 +852,14 @@ In the above Python script, the following steps are executed: precision Compute precision, such as FP32 or FP16. str -fp32 +"fp32" enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -893,8 +876,7 @@ In the above Python script, the following steps are executed: -(2) -Call the `predict()` method of the formula recognition pipeline object to perform inference prediction. This method will return a list of results. +(2)Call the `predict()` method of the formula recognition pipeline object to perform inference prediction. This method will return a list of results. Additionally, the pipeline also provides the `predict_iter()` method. Both methods are completely consistent in terms of parameter acceptance and result return. The difference is that `predict_iter()` returns a `generator`, which allows for step-by-step processing and retrieval of prediction results. This is suitable for handling large datasets or scenarios where memory saving is desired. You can choose to use either of these methods based on your actual needs. @@ -913,18 +895,13 @@ Here are the parameters of the `predict()` method and their descriptions: input Data to be predicted, supporting multiple input types, required.
- • Python Var: Image data represented by numpy.ndarray
- • str: Local path of image or PDF file, e.g., /root/data/img.jpg; URL link, e.g., network URL of image or PDF file: Example; Local directory, the directory should contain images to be predicted, e.g., local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files must be specified with a specific file path)
- • List: Elements of the list must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], [\"/root/data/img1.jpg\", \"/root/data/img2.jpg\"], [\"/root/data1\", \"/root/data2\"]
+ • Python Var: Image data represented by numpy.ndarray;
+ • str: Local path of image or PDF file, e.g., /root/data/img.jpg; URL link, e.g., network URL of image or PDF file: Example; Local directory, the directory should contain images to be predicted, e.g., local path: /root/data/ (currently does not support prediction of PDF files in directories; PDF files must be specified with a specific file path);
+ • List: Elements of the list must be of the above types, e.g., [numpy.ndarray, numpy.ndarray], [\"/root/data/img1.jpg\", \"/root/data/img2.jpg\"], [\"/root/data1\", \"/root/data2\"].
    Python Var|str|list - -device -The parameters are the same as those used during instantiation. -str -None use_layout_detection @@ -961,7 +938,7 @@ Whether to use the document orientation classification module during inference.< layout_unclip_ratio The parameters are the same as those used during instantiation. -float|list +float|Tuple[float,float]|dict None layout_merge_bboxes_mode @@ -990,19 +967,19 @@ Whether to use the document orientation classification module during inference.< Print results to terminal format_json bool -Whether to format the output content using JSON indentation +Whether to format the output content using JSON indentation. True indent int -Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True +Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True. 4 ensure_ascii bool -Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False retains the original characters. Effective only when format_json is True +Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False retains the original characters. Effective only when format_json is True. False @@ -1010,19 +987,19 @@ Whether to use the document orientation classification module during inference.< Save results as a JSON file save_path str -Path to save the file. If it is a directory, the saved file will be named the same as the input file type +Path to save the file. If it is a directory, the saved file will be named the same as the input file type. 无 indent int -Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True +Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True. 4 ensure_ascii bool -Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False retains the original characters. Effective only when format_json is True +Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False retains the original characters. Effective only when format_json is True. False @@ -1030,7 +1007,7 @@ Whether to use the document orientation classification module during inference.< Save results as an image file save_path str -Path to save the file, supports directory or file path +Path to save the file, supports directory or file path. 无 @@ -1102,8 +1079,8 @@ In addition, PaddleOCR also provides two other deployment methods, which are det 🚀 High-Performance Inference: In real-world production environments, many applications have stringent standards for performance metrics of deployment strategies, particularly regarding response speed, to ensure efficient system operation and a smooth user experience. To address this, PaddleOCR offers high-performance inference capabilities designed to deeply optimize the performance of model inference and pre/post-processing, significantly accelerating the end-to-end process. For detailed information on the high-performance inference process, please refer to the [High-Performance Inference Guide](../deployment/high_performance_inference.en.md). 
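Before moving on to serving, note that the high-performance path is toggled through the same constructor surface as the other options documented in the tables above. A minimal sketch using the enable_hpi flag from those tables; whether acceleration actually engages still depends on the high-performance inference components installed in the environment, as the guide linked above explains:

```python
from paddleocr import FormulaRecognitionPipeline

# enable_hpi is the constructor flag listed in the parameter tables above; it
# requests the high-performance inference path for model execution.
pipeline = FormulaRecognitionPipeline(enable_hpi=True)

output = pipeline.predict("./general_formula_recognition_001.png")
for res in output:
    res.print()
    res.save_to_json(save_path="output")
```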
-☁️ Service-Based Deployment: -Service-Based Deployment is a common deployment form in real-world production environments. By encapsulating inference capabilities as a service, clients can access these services via network requests to obtain inference results. For detailed instructions on Service-Based Deployment in production lines, please refer to the [Service-Based Deployment Guide](../deployment/serving.md). +☁️ Service-Based Deployment: +Service-Based Deployment is a common deployment form in real-world production environments. By encapsulating inference capabilities as a service, clients can access these services via network requests to obtain inference results. For detailed instructions on Service-Based Deployment in pipelines, please refer to the [Service-Based Deployment Guide](../deployment/serving.en.md). Below are the API references for basic service-based deployment and multi-language service invocation examples: @@ -1345,6 +1322,8 @@ for i, res in enumerate(result["formulaRecResults"]): If the default model weights provided by the formula recognition pipeline do not meet your requirements in terms of accuracy or speed, you can try to fine-tune the existing models using your own domain-specific or application-specific data to improve the recognition performance of the formula recognition pipeline in your scenario. +### 4.1 Model Fine-Tuning + Since the formula recognition pipeline consists of several modules, if the pipeline's performance is not satisfactory, the issue may arise from any one of these modules. You can analyze the poorly recognized images to determine which module is problematic and refer to the corresponding fine-tuning tutorial links in the table below for model fine-tuning. @@ -1359,17 +1338,17 @@ Since the formula recognition pipeline consists of several modules, if the pipel - + - + - + @@ -1378,3 +1357,122 @@ Since the formula recognition pipeline consists of several modules, if the pipel
    Formulas are missing Layout Detection ModuleLinkLink
    Formula content is inaccurate Formula Recognition ModuleLinkLink
    Whole-image rotation correction is inaccurate Document Image Orientation Classification ModuleLinkLink
    Image distortion correction is inaccurate
    + +### 4.2 Model Deployment + +After you complete fine-tuning training using a private dataset, you can obtain a local model weight file. You can then use the fine-tuned model weights by specifying the local model save path through parameters or by customizing the pipeline configuration file. + +#### 4.2.1 Specify the local model path through parameters + +When initializing the pipeline object, specify the local model path through parameters. Take the usage of the weights after fine-tuning the text detection model as an example, as follows: + +Command line mode: + +```bash +# Specify the local model path via --formula_recognition_model_dir +paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --formula_recognition_model_dir your_formula_recognition_model_path + +# PP-FormulaNet_plus-M model is used as the default formula recognition model. If you do not fine-tune this model, modify the model name by using --formula_recognition_model_name +paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --formula_recognition_model_name PP-FormulaNet_plus-M --formula_recognition_model_dir your_ppformulanet_plus-m_formula_recognition_model_path +``` + +Script mode: + +```python + +from paddleocr import FormulaRecognitionPipeline + +# Specify the local model path via formula_recognition_model_dir +pipeline = FormulaRecognitionPipeline(formula_recognition_model_dir="./your_formula_recognition_model_path") +output = pipeline.predict("./general_formula_recognition_001.png") +for res in output: + res.print() ## Print the structured output of the prediction + res.save_to_img(save_path="output") ## Save the formula visualization result of the current image. + res.save_to_json(save_path="output") ## Save the structured JSON result of the current image + +# PP-FormulaNet_plus-M model is used as the default formula recognition model. If you do not fine-tune this model, modify the model name by using formula_recognition_model_name +# pipeline = FormulaRecognitionPipeline(formula_recognition_model_name="PP-FormulaNet_plus-M", formula_recognition_model_dir="./your_ppformulanet_plus-m_formula_recognition_model_path") + +``` + + +#### 4.2.2 Specify the local model path through the configuration file + + +1.Obtain the pipeline configuration file + +Call the `export_paddlex_config_to_yaml` method of the **Formula Recognition Pipeline** object in PaddleOCR to export the current pipeline configuration as a YAML file: + +```Python +from paddleocr import FormulaRecognitionPipeline + +pipeline = FormulaRecognitionPipeline() +pipeline.export_paddlex_config_to_yaml("FormulaRecognitionPipeline.yaml") +``` + +2.Modify the Configuration File + +After obtaining the default pipeline configuration file, replace the paths of the default model weights with the local paths of your fine-tuned model weights. For example: + +```yaml +...... 
+SubModules: + FormulaRecognition: + batch_size: 5 + model_dir: null # Replace with the path to your fine-tuned formula recognition model weights + model_name: PP-FormulaNet_plus-M # If the name of the fine-tuned model is different from the default model name, please modify it here as well + module_name: formula_recognition + LayoutDetection: + batch_size: 1 + layout_merge_bboxes_mode: large + layout_nms: true + layout_unclip_ratio: 1.0 + model_dir: null # Replace with the path to your fine-tuned layout detection model weights + model_name: PP-DocLayout_plus-L # If the name of the fine-tuned model is different from the default model name, please modify it here as well + module_name: layout_detection + threshold: 0.5 +SubPipelines: + DocPreprocessor: + SubModules: + DocOrientationClassify: + batch_size: 1 + model_dir: null # Replace with the path to your fine-tuned document image orientation classification model weights + model_name: PP-LCNet_x1_0_doc_ori # If the name of the fine-tuned model is different from the default model name, please modify it here as well + module_name: doc_text_orientation + DocUnwarping: + batch_size: 1 + model_dir: null + model_name: UVDoc + module_name: image_unwarping + pipeline_name: doc_preprocessor + use_doc_orientation_classify: true + use_doc_unwarping: true +pipeline_name: formula_recognition +use_doc_preprocessor: true +use_layout_detection: true +...... +``` + +The pipeline configuration file includes not only the parameters supported by the PaddleOCR CLI and Python API but also advanced configurations. For detailed instructions, refer to the [PaddleX Pipeline Usage Overview](https://paddlepaddle.github.io/PaddleX/3.0/en/pipeline_usage/pipeline_develop_guide.html) and adjust the configurations as needed. + +3.Load the Configuration File in CLI + +After modifying the configuration file, specify its path using the `--paddlex_config` parameter in the command line. PaddleOCR will read the file and apply the configurations. Example: + +```bash +paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --paddlex_config FormulaRecognitionPipeline.yaml +``` +4.Load the Configuration File in Python API + +When initializing the pipeline object, pass the path of the PaddleX pipeline configuration file or a configuration dictionary via the `paddlex_config` parameter. PaddleOCR will read and apply the configurations. Example: + +```python +from paddleocr import FormulaRecognitionPipeline + +pipeline = FormulaRecognitionPipeline(paddlex_config="FormulaRecognitionPipeline.yaml") +output = pipeline.predict("./general_formula_recognition_001.png") +for res in output: + res.print() ## Print the structured output of the prediction + res.save_to_img(save_path="output") ## Save the formula visualization result of the current image. 
+ res.save_to_json(save_path="output") ## Save the structured JSON result of the current image +``` diff --git a/docs/version3.x/pipeline_usage/formula_recognition.md b/docs/version3.x/pipeline_usage/formula_recognition.md index 33e110dd4a73f8be7d6d7b75f5e24f2e320360ba..53813cb1c814574c4d30dd3cb93355ee6d907c46 100644 --- a/docs/version3.x/pipeline_usage/formula_recognition.md +++ b/docs/version3.x/pipeline_usage/formula_recognition.md @@ -388,7 +388,7 @@ comments: true ### 2.1 命令行方式体验 -一行命令即可快速体验 formula_recognition 产线效果: +一行命令即可快速体验 formula_recognition 产线效果。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/pipelines/general_formula_recognition_001.png)到本地: ```bash paddleocr formula_recognition_pipeline -i https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/pipelines/general_formula_recognition_001.png @@ -416,158 +416,145 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png input -待预测数据,支持多种输入类型,必填。 -
- • Python Var:如 numpy.ndarray 表示的图像数据
- • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
- • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]、["/root/data/img1.jpg", "/root/data/img2.jpg"]、["/root/data1", "/root/data2"]
    +待预测数据,必填。 +如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_orientation_classify_batch_size -文档方向分类模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +文档方向分类模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_batch_size -文本图像矫正模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +文本图像矫正模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_detection_model_name -版面区域检测模型的名称。如果设置为None, 将会使用产线默认模型。 +版面区域检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + layout_detection_model_dir -版面区域检测模型的目录路径。如果设置为None, 将会下载官方模型。 +版面区域检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + layout_threshold 版面区域检测的阈值,用于过滤掉低置信度预测结果的阈值。 -
- • float,如 0.2, 表示过滤掉所有阈值小于0.2的目标框
- • 字典,字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为2的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
- • None, 不指定,将默认使用默认值
    +如 0.2,表示过滤掉所有阈值小于0.2的目标框。如果不设置,将默认使用默认值。 -float|dict -None +float + layout_nms -版面区域检测是否使用NMS后处理,过滤重叠框。如果设置为None, 将会使用官方模型配置。 +版面检测是否使用后处理NMS。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_unclip_ratio 版面区域检测中检测框的边长缩放倍数。 -
- • float, 大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
- • 列表, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
- • None, 不指定,将使用默认值:1.0
    +大于0的浮点数,如 1.1 ,表示将模型输出的检测框中心不变,宽和高都扩张1.1倍。如果不设置,将使用默认值:1.0。 -float|list -None +float + layout_merge_bboxes_mode 版面区域检测中模型输出的检测框的合并处理模式。
- • large, 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
- • small, 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
- • union, 不进行框的过滤处理,内外框都保留
- • None, 不指定,将使用默认值:“large”
+ • large,设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框;
+ • small,设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框;
+ • union,不进行框的过滤处理,内外框都保留
  • +如果不设置,将使用默认值:“large”; str -None + layout_detection_batch_size -版面区域检测模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +版面区域检测模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + use_layout_detection -是否加载版面区域检测模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用版面区域检测模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + formula_recognition_model_name -公式识别模型的名称。如果设置为None, 将会使用产线默认模型。 +公式识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + formula_recognition_model_dir -公式识别模型的目录路径。如果设置为None, 将会下载官方模型。 +公式识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + formula_recognition_batch_size -公式识别模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +公式识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -575,11 +562,11 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
- • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    +如果不设置, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 + str -None + enable_hpi @@ -607,10 +594,10 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -622,7 +609,7 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png paddlex_config PaddleX产线配置文件路径。 str -None + @@ -684,61 +671,61 @@ for res in output: doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_orientation_classify_batch_size -文档方向分类模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +文档方向分类模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_unwarping_batch_size -文本图像矫正模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +文本图像矫正模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None layout_detection_model_name -版面区域检测模型的名称。如果设置为None, 将会使用产线默认模型。 +版面区域检测模型的名称。如果设置为None,将会使用产线默认模型。 str None layout_detection_model_dir -版面区域检测模型的目录路径。如果设置为None, 将会下载官方模型。 +版面区域检测模型的目录路径。如果设置为None,将会下载官方模型。 str None @@ -746,78 +733,77 @@ for res in output: layout_threshold 版面区域检测的阈值,用于过滤掉低置信度预测结果的阈值。
-• float,如 0.2,表示过滤掉所有阈值小于0.2的目标框
-• 字典,字典的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为2的类别应用阈值0.48、cls_id为7的类别应用阈值0.4
-• None,不指定,将使用默认值:0.5
+• float:如 0.2,表示过滤掉所有阈值小于0.2的目标框;
+• dict:dict的key为int类型,代表cls_id,val为float类型阈值。如 {0: 0.45, 2: 0.48, 7: 0.4},表示对cls_id为0的类别应用阈值0.45、cls_id为2的类别应用阈值0.48、cls_id为7的类别应用阈值0.4;
+• None:不指定,将使用默认值:0.5。
  • float|dict None layout_nms -版面区域检测是否使用NMS后处理,过滤重叠框。如果设置为None, 将会使用官方模型配置。 +版面检测是否使用后处理NMS。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool None layout_unclip_ratio -版面区域检测中检测框的边长缩放倍数。 +版面区域检测模型检测框的扩张系数。
-• float,大于0的浮点数,如 1.1,表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
-• 列表,如 [1.2, 1.5],表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
-• None,不指定,将使用默认值:1.0
+• float:任意大于 0 浮点数;
+• Tuple[float,float]:在横纵两个方向各自的扩张系数;
+• dict:dict的key为int类型,代表cls_id,value为tuple类型,如 {0: (1.1, 2.0)},表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 1.0。
    -float|list +float|Tuple[float,float]|dict None layout_merge_bboxes_mode -版面区域检测中模型输出的检测框的合并处理模式。 +版面区域检测的重叠框过滤方式。
-• large,设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框
-• small,设置为small时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框
-• union,不进行框的过滤处理,内外框都保留
-• None,不指定,将使用默认值:“large”
+• str:large、small、union,分别表示重叠框过滤时选择保留大框、小框,还是同时保留;
+• dict:dict的key为int类型,代表cls_id,value为str类型,如 {0: "large", 2: "small"},表示对第0类别检测框使用large模式,对第2类别检测框使用small模式;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 large。
    -str +str|dict None layout_detection_batch_size -版面区域检测模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +版面区域检测模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None use_layout_detection -是否加载版面区域检测模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用版面区域检测模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None formula_recognition_model_name -公式识别模型的名称。如果设置为None, 将会使用产线默认模型。 +公式识别模型的名称。如果设置为None,将会使用产线默认模型。 str None formula_recognition_model_dir -公式识别模型的目录路径。如果设置为None, 将会下载官方模型。 +公式识别模型的目录路径。如果设置为None,将会下载官方模型。 str None formula_recognition_batch_size -公式识别模型的批处理大小。如果设置为 None, 将默认设置批处理大小为1。 +公式识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
• CPU:如 cpu 表示使用 CPU 进行推理;
• GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -825,7 +811,7 @@ for res in output:
• XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
• MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
• DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
-• None:如果设置为 None,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
    str @@ -853,14 +839,14 @@ for res in output: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -896,18 +882,13 @@ for res in output: input 待预测数据,支持多种输入类型,必填
-• Python Var:如 numpy.ndarray 表示的图像数据
-• str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
-• List:列表元素需为上述类型数据,如 [numpy.ndarray, numpy.ndarray],["/root/data/img1.jpg", "/root/data/img2.jpg"],["/root/data1", "/root/data2"]
+• Python Var:如 numpy.ndarray 表示的图像数据;
+• str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径);
+• List:列表元素需为上述类型数据,如 [numpy.ndarray, numpy.ndarray],["/root/data/img1.jpg", "/root/data/img2.jpg"],["/root/data1", "/root/data2"]。
    Python Var|str|list - -device -与实例化时的参数相同。 -str -None use_layout_detection @@ -942,12 +923,12 @@ for res in output: layout_unclip_ratio 与实例化时的参数相同。 -float|list +float|Tuple[float,float]|dict None layout_merge_bboxes_mode 与实例化时的参数相同。 -string +str|dict None @@ -971,19 +952,19 @@ for res in output: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -991,19 +972,19 @@ for res in output: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1011,7 +992,7 @@ for res in output: 将结果保存为图像格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -1068,7 +1049,7 @@ for res in output: - `json` 属性获取的预测结果为dict类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 -- `img` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `preprocessed_img`、 `layout_det_res`和 `formula_res_img`,对应的值是三个 `Image.Image` 对象:第一个用于展示图像预处理的可视化图像,第二个用于展示版面区域检测的可视化图像,第三个用于展示公式识别的可视化图像。如果没有使用图像预处理子模块,则字典中不包含 `preprocessed_img`;如果没有使用版面区域检测子模块,则字典中不包含`layout_det_res`。 +- `img` 属性返回的预测结果是一个dict类型的数据。其中,键分别为 `preprocessed_img`、 `layout_det_res`和 `formula_res_img`,对应的值是三个 `Image.Image` 对象:第一个用于展示图像预处理的可视化图像,第二个用于展示版面区域检测的可视化图像,第三个用于展示公式识别的可视化图像。如果没有使用图像预处理子模块,则dict中不包含 `preprocessed_img`;如果没有使用版面区域检测子模块,则dict中不包含`layout_det_res`。 ## 3. 
开发集成/部署 @@ -1202,31 +1183,31 @@ for res in output: useLayoutDetection boolean | null -请参阅产线对象中 predict 方法的 use_layout_detection 参数相关说明。 +请参阅产线对象中 predict方法的 use_layout_detection 参数相关说明。 否 layoutThreshold number | null -请参阅产线对象中 predict 方法的 layout_threshold 参数相关说明。 +请参阅产线对象中predict 法的layout_threshold 参数相关说明。 否 layoutNms boolean | null -请参阅产线对象中 predict 方法的 layout_nms 参数相关说明。 +请参阅产线对象中predict 法的layout_nms 参数相关说明。 否 layoutUnclipRatio number | array | null -请参阅产线对象中 predict 方法的 layout_unclip_ratio 参数相关说明。 +请参阅产线对象中 predict方法的 layout_unclip_ratio 参数相关说明。 否 layoutMergeBboxesMode string | null -请参阅产线对象中 predict 方法的 layout_merge_bboxes_mode 参数相关说明。 +请参阅产线对象中predict方法的 layout_merge_bboxes_mode 参数相关说明。 否 @@ -1268,12 +1249,12 @@ for res in output: prunedResult object -产线对象的 predict 方法生成结果的 JSON 表示中 res 字段的简化版本,其中去除了 input_pathpage_index 字段。 +产线对象的 predict 方法生成结果的JSON表示中res字段的简化版本,其中去除了input_pathpage_index字段。 outputImages object | null -参见产线预测结果的 img 属性说明。图像为JPEG格式,使用Base64编码。 +参见产线预测结果的img属性说明。图像为JPEG格式,使用Base64编码。 inputImage | null @@ -1319,6 +1300,7 @@ for i, res in enumerate(result["formulaRecResults"]): 如果公式识别产线提供的默认模型权重在您的场景中,精度或速度不满意,您可以尝试利用您自己拥有的特定领域或应用场景的数据对现有模型进行进一步的微调,以提升公式识别产线的在您的场景中的识别效果。 +### 4.1 模型微调 由于公式识别产线包含若干模块,模型产线的效果如果不及预期,可能来自于其中任何一个模块。您可以对识别效果差的图片进行分析,进而确定是哪个模块存在问题,并参考以下表格中对应的微调教程链接进行模型微调。 @@ -1334,17 +1316,17 @@ for i, res in enumerate(result["formulaRecResults"]): 公式存在漏检 版面区域检测模块 -链接 +链接 公式内容不准 公式识别模块 -链接 +链接 整图旋转矫正不准 文档图像方向分类模块 -链接 +链接 图像扭曲矫正不准 @@ -1353,3 +1335,122 @@ for i, res in enumerate(result["formulaRecResults"]): + +### 4.2 模型应用 + +当您使用私有数据集完成微调训练后,可获得本地模型权重文件,然后可以通过参数指定本地模型保存路径的方式,或者通过自定义产线配置文件的方式,使用微调后的模型权重。 + +#### 4.2.1 通过参数指定本地模型路径 + +在初始化产线对象时,通过参数指定本地模型路径。以公式识别模型微调后的权重的使用方法为例,示例如下: + +命令行方式: + +```bash +# 通过 --formula_recognition_model_dir 指定本地模型路径 +paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --formula_recognition_model_dir your_formula_recognition_model_path + +# 默认使用 PP-FormulaNet_plus-M 模型作为默认公式识别模型,如果微调的不是该模型,通过 --formula_recognition_model_name 修改模型名称 +paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --formula_recognition_model_name PP-FormulaNet_plus-M --formula_recognition_model_dir your_ppformulanet_plus-m_formula_recognition_model_path +``` + +脚本方式: + +```python + +from paddleocr import FormulaRecognitionPipeline + +# 通过 formula_recognition_model_dir 指定本地模型路径 +pipeline = FormulaRecognitionPipeline(formula_recognition_model_dir="./your_formula_recognition_model_path") +output = pipeline.predict("./general_formula_recognition_001.png") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_img(save_path="output") ## 保存当前图像的公式可视化结果 + res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 +# 默认使用 PP-FormulaNet_plus-M 模型作为默认公式识别模型,如果微调的不是该模型,通过 formula_recognition_model_name 修改模型名称 +# pipeline = FormulaRecognitionPipeline(formula_recognition_model_name="PP-FormulaNet_plus-M", formula_recognition_model_dir="./your_ppformulanet_plus-m_formula_recognition_model_path") + +``` + + +#### 4.2.2 通过配置文件指定本地模型路径 + + +1.获取产线配置文件 + +可调用 PaddleOCR 中 公式识别 产线对象的 `export_paddlex_config_to_yaml` 方法,将当前产线配置导出为 YAML 文件: + +```Python +from paddleocr import FormulaRecognitionPipeline + +pipeline = FormulaRecognitionPipeline() +pipeline.export_paddlex_config_to_yaml("FormulaRecognitionPipeline.yaml") +``` + +2.修改配置文件 + +在得到默认的产线配置文件后,将微调后模型权重的本地路径替换至产线配置文件中的对应位置即可。例如 + +```yaml +...... 
+SubModules: + FormulaRecognition: + batch_size: 5 + model_dir: null # 替换为微调后的公式模型权重路径 + model_name: PP-FormulaNet_plus-M # 如果微调的模型名称与默认模型名称不同,请一并修改此处 + module_name: formula_recognition + LayoutDetection: + batch_size: 1 + layout_merge_bboxes_mode: large + layout_nms: true + layout_unclip_ratio: 1.0 + model_dir: null # 替换为微调后的版面区域检测模型权重路径 + model_name: PP-DocLayout_plus-L # 如果微调的模型名称与默认模型名称不同,请一并修改此处 + module_name: layout_detection + threshold: 0.5 +SubPipelines: + DocPreprocessor: + SubModules: + DocOrientationClassify: + batch_size: 1 + model_dir: null # 替换为微调后的文档图像方向分类模型权重路径 + model_name: PP-LCNet_x1_0_doc_ori # 如果微调的模型名称与默认模型名称不同,请一并修改此处 + module_name: doc_text_orientation + DocUnwarping: + batch_size: 1 + model_dir: null + model_name: UVDoc + module_name: image_unwarping + pipeline_name: doc_preprocessor + use_doc_orientation_classify: true + use_doc_unwarping: true +pipeline_name: formula_recognition +use_doc_preprocessor: true +use_layout_detection: true +...... +``` + +在产线配置文件中,不仅包含 PaddleOCR CLI 和 Python API 支持的参数,还可进行更多高级配置,具体信息可在 [PaddleX模型产线使用概览](https://paddlepaddle.github.io/PaddleX/3.0/pipeline_usage/pipeline_develop_guide.html) 中找到对应的产线使用教程,参考其中的详细说明,根据需求调整各项配置。 + +3.在 CLI 中加载产线配置文件 + +在修改完成配置文件后,通过命令行的 --paddlex_config 参数指定修改后的产线配置文件的路径,PaddleOCR 会读取其中的内容作为产线配置。示例如下: + +```bash +paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --paddlex_config FormulaRecognitionPipeline.yaml +``` + +4.在 Python API 中加载产线配置文件 + +初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置dict,PaddleOCR 会读取其中的内容作为产线配置。示例如下: + +```python +from paddleocr import FormulaRecognitionPipeline + +pipeline = FormulaRecognitionPipeline(paddlex_config="FormulaRecognitionPipeline.yaml") +output = pipeline.predict("./general_formula_recognition_001.png") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_img(save_path="output") ## 保存当前图像的公式可视化结果 + res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 +``` diff --git a/docs/version3.x/pipeline_usage/instructions/parallel_inference.en.md b/docs/version3.x/pipeline_usage/instructions/parallel_inference.en.md new file mode 100644 index 0000000000000000000000000000000000000000..205db03d75b346c70d9260da430580a933e91162 --- /dev/null +++ b/docs/version3.x/pipeline_usage/instructions/parallel_inference.en.md @@ -0,0 +1,216 @@ +# Parallel Inference in Pipeline + +## Specifying Multiple Inference Devices + +For some pipelines, both the CLI and Python API of PaddleOCR support specifying multiple inference devices simultaneously. If multiple devices are specified, during pipeline initialization, an instance of the underlying pipeline class will be created on each device, and the received inputs will be processed using parallel inference. For example, for the document image preprocessing pipeline: + +```bash +paddleocr doc_preprocessor \ + --input input_images/ \ + --device 'gpu:0,1,2,3' \ + --use_doc_orientation_classify True \ + --use_doc_unwarping True + --save_path ./output \ + +``` + +```python +from paddleocr import DocPreprocessor + + +pipeline = DocPreprocessor(device="gpu:0,1,2,3") +output = pipeline.predict( + input="input_images/", + use_doc_orientation_classify=True, + use_doc_unwarping=True) + +``` + +Both examples above use 4 GPUs (numbered 0, 1, 2, 3) to perform parallel inference on the `doc_test_rotated.jpg` image. + +When specifying multiple devices, the inference interface remains consistent with that of single-device usage. 
Please refer to the production line usage tutorial to check whether a specific production line supports multiple inference devices. + +## Example of Multi-Process Parallel Inference + +Beyond PaddleOCR's built-in multi-device parallel inference capability, users can also implement parallelism by wrapping PaddleOCR pipeline API calls themselves according to their specific scenario, with a view to achieving a better speedup. Below is an example of using Python multiprocessing to perform multi-GPU, multi-instance parallel processing on files in an input directory. + + + +```python +import argparse +import sys +from multiprocessing import Manager, Process +from pathlib import Path +from queue import Empty + +import paddleocr + + +def load_pipeline(class_name: str, device: str): + if not hasattr(paddleocr, class_name): + raise ValueError(f"Class {class_name} not found in paddleocr module.") + cls = getattr(paddleocr, class_name) + return cls(device=device) + + +def worker(pipeline_class_path, device, task_queue, batch_size, output_dir): + pipeline = load_pipeline(pipeline_class_path, device) + + should_end = False + batch = [] + + while not should_end: + try: + input_path = task_queue.get_nowait() + except Empty: + should_end = True + else: + batch.append(input_path) + + if batch and (len(batch) == batch_size or should_end): + try: + for result in pipeline.predict(batch): + input_path = Path(result["input_path"]) + if result.get("page_index") is not None: + output_path = f"{input_path.stem}_{result['page_index']}.json" + else: + output_path = f"{input_path.stem}.json" + output_path = str(Path(output_dir, output_path)) + result.save_to_json(output_path) + print(f"Processed {repr(str(input_path))}") + except Exception as e: + print( + f"Error processing {batch} on {repr(device)}: {e}", + file=sys.stderr + ) + batch.clear() + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--pipeline", + type=str, + required=True, + help="PaddleOCR pipeline, e.g. 'DocPreprocessor'.", + ) + parser.add_argument( + "--input_dir", type=str, required=True, help="Input directory." + ) + parser.add_argument( + "--device", + type=str, + required=True, + help="Specifies the devices for performing parallel inference.", + ) + parser.add_argument( + "--output_dir", type=str, default="output", help="Output directory." 
+ ) + parser.add_argument( + "--instances_per_device", + type=int, + default=1, + help="Number of pipeline instances per device.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="Inference batch size for each pipeline instance.", + ) + parser.add_argument( + "--input_glob_pattern", + type=str, + default="*", + help="Pattern to find the input files.", + ) + args = parser.parse_args() + + input_dir = Path(args.input_dir) + if not input_dir.exists(): + print(f"The input directory does not exist: {input_dir}", file=sys.stderr) + return 2 + if not input_dir.is_dir(): + print(f"{repr(str(input_dir))} is not a directory.", file=sys.stderr) + return 2 + + output_dir = Path(args.output_dir) + if output_dir.exists() and not output_dir.is_dir(): + print(f"{repr(str(output_dir))} is not a directory.", file=sys.stderr) + return 2 + output_dir.mkdir(parents=True, exist_ok=True) + + from paddlex.utils.device import constr_device, parse_device + + device_type, device_ids = parse_device(args.device) + if device_ids is None or len(device_ids) == 1: + print( + "Please specify at least two devices for performing parallel inference.", + file=sys.stderr, + ) + return 2 + + if args.batch_size <= 0: + print("Batch size must be greater than 0.", file=sys.stderr) + return 2 + + with Manager() as manager: + task_queue = manager.Queue() + for img_path in input_dir.glob(args.input_glob_pattern): + task_queue.put(str(img_path)) + + processes = [] + for device_id in device_ids: + for _ in range(args.instances_per_device): + device = constr_device(device_type, [device_id]) + p = Process( + target=worker, + args=( + args.pipeline, + device, + task_queue, + args.batch_size, + str(output_dir), + ), + ) + p.start() + processes.append(p) + + for p in processes: + p.join() + + print("All done") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + +``` +Assuming the script is saved as `infer_mp.py`, here are some example commands for running it: +```bash +# This is for the general layout analysis V3 pipeline, corresponding to `PPStructureV3` +# For the exact value of the `--pipeline` parameter, please refer to the **script** import name of the pipeline +# Other pipelines include `SealRecognition` for the seal text pipeline, and `DocUnderstanding` for the document understanding pipeline +# Process all files in the `input_images` directory +# Use GPUs 0, 1, 2, and 3, with 1 pipeline instance per GPU, and each instance processes 1 input file at a time +python infer_mp.py \ + --pipeline PPStructureV3 \ + --input_dir input_images/ \ + --device 'gpu:0,1,2,3' \ + --output_dir output + +# General layout analysis V3 pipeline +# Process all files with the `.jpg` suffix in the `input_images` directory +# Use GPUs 0 and 2, with 2 pipeline instances per GPU, and each instance processes 4 input files at a time +python infer_mp.py \ + --pipeline PPStructureV3 \ + --input_dir input_images/ \ + --device 'gpu:0,2' \ + --output_dir output \ + --instances_per_device 2 \ + --batch_size 4 \ + --input_glob_pattern '*.jpg' + +``` diff --git a/docs/version3.x/pipeline_usage/instructions/parallel_inference.md b/docs/version3.x/pipeline_usage/instructions/parallel_inference.md new file mode 100644 index 0000000000000000000000000000000000000000..aa341b2414c472f480b3c35582db3865cb93356e --- /dev/null +++ b/docs/version3.x/pipeline_usage/instructions/parallel_inference.md @@ -0,0 +1,215 @@ +# 产线并行推理 + +## 指定多个推理设备 + +对于部分产线的 CLI 和 Python API,PaddleOCR 
支持同时指定多个推理设备。如果指定了多个设备,产线初始化时将在每个设备上创建一个底层产线类对象的实例,并对接收到的输入进行并行推理。例如,对于文档图像预处理产线: + +```bash +paddleocr doc_preprocessor \ + --input input_images/ \ + --device 'gpu:0,1,2,3' \ + --use_doc_orientation_classify True \ + --use_doc_unwarping True + --save_path ./output \ + +``` + +```python +from paddleocr import DocPreprocessor + + +pipeline = DocPreprocessor(device="gpu:0,1,2,3") +output = pipeline.predict( + input="input_images/", + use_doc_orientation_classify=True, + use_doc_unwarping=True) + +``` + +以上两个例子均使用 4 块 GPU(编号为 0、1、2、3)对 `doc_test_rotated.jpg` 图片进行并行推理。 + +指定多个设备时,推理接口仍然与指定单设备时保持一致。请查看产线使用教程以了解某一产线是否支持指定多个推理设备。 + +## 多进程并行推理示例 + +除了使用 PaddleOCR 内置的多设备并行推理功能外,用户也可以结合实际场景,通过封装 PaddleOCR 产线 API 调用来实现并行处理,从而获得更优的加速效果。如下是使用 Python 多进程实现多卡、多实例并行处理输入目录中的文件的示例代码: + +```python +import argparse +import sys +from multiprocessing import Manager, Process +from pathlib import Path +from queue import Empty + +import paddleocr + + +def load_pipeline(class_name: str, device: str): + if not hasattr(paddleocr, class_name): + raise ValueError(f"Class {class_name} not found in paddleocr module.") + cls = getattr(paddleocr, class_name) + return cls(device=device) + + +def worker(pipeline_class_path, device, task_queue, batch_size, output_dir): + pipeline = load_pipeline(pipeline_class_path, device) + + should_end = False + batch = [] + + while not should_end: + try: + input_path = task_queue.get_nowait() + except Empty: + should_end = True + else: + batch.append(input_path) + + if batch and (len(batch) == batch_size or should_end): + try: + for result in pipeline.predict(batch): + input_path = Path(result["input_path"]) + if result.get("page_index") is not None: + output_path = f"{input_path.stem}_{result['page_index']}.json" + else: + output_path = f"{input_path.stem}.json" + output_path = str(Path(output_dir, output_path)) + result.save_to_json(output_path) + print(f"Processed {repr(str(input_path))}") + except Exception as e: + print( + f"Error processing {batch} on {repr(device)}: {e}", + file=sys.stderr + ) + batch.clear() + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--pipeline", + type=str, + required=True, + help="PaddleOCR pipeline, e.g. 'DocPreprocessor'.", + ) + parser.add_argument( + "--input_dir", type=str, required=True, help="Input directory." + ) + parser.add_argument( + "--device", + type=str, + required=True, + help="Specifies the devices for performing parallel inference.", + ) + parser.add_argument( + "--output_dir", type=str, default="output", help="Output directory." 
+ ) + parser.add_argument( + "--instances_per_device", + type=int, + default=1, + help="Number of pipeline instances per device.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="Inference batch size for each pipeline instance.", + ) + parser.add_argument( + "--input_glob_pattern", + type=str, + default="*", + help="Pattern to find the input files.", + ) + args = parser.parse_args() + + input_dir = Path(args.input_dir) + if not input_dir.exists(): + print(f"The input directory does not exist: {input_dir}", file=sys.stderr) + return 2 + if not input_dir.is_dir(): + print(f"{repr(str(input_dir))} is not a directory.", file=sys.stderr) + return 2 + + output_dir = Path(args.output_dir) + if output_dir.exists() and not output_dir.is_dir(): + print(f"{repr(str(output_dir))} is not a directory.", file=sys.stderr) + return 2 + output_dir.mkdir(parents=True, exist_ok=True) + + from paddlex.utils.device import constr_device, parse_device + + device_type, device_ids = parse_device(args.device) + if device_ids is None or len(device_ids) == 1: + print( + "Please specify at least two devices for performing parallel inference.", + file=sys.stderr, + ) + return 2 + + if args.batch_size <= 0: + print("Batch size must be greater than 0.", file=sys.stderr) + return 2 + + with Manager() as manager: + task_queue = manager.Queue() + for img_path in input_dir.glob(args.input_glob_pattern): + task_queue.put(str(img_path)) + + processes = [] + for device_id in device_ids: + for _ in range(args.instances_per_device): + device = constr_device(device_type, [device_id]) + p = Process( + target=worker, + args=( + args.pipeline, + device, + task_queue, + args.batch_size, + str(output_dir), + ), + ) + p.start() + processes.append(p) + + for p in processes: + p.join() + + print("All done") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + +``` + +假设将上述脚本存储为 `infer_mp.py`,以下是一些调用示例: + +```bash +# 确定 `--pipeline` 参数需查看其产线 **脚本方式** 导入类名称 +# 此处为通用版面解析 v3 产线,对应PPStructureV3 +# 其余产线例如印章文本产线对应SealRecognition,文档理解产线对应DocUnderstanding +# 处理 `input_images` 目录中所有文件 +# 使用 GPU 0、1、2、3,每块 GPU 上 1 个产线实例,每个实例一次处理 1 个输入文件 +python infer_mp.py \ + --pipeline PPStructureV3 \ + --input_dir input_images/ \ + --device 'gpu:0,1,2,3' \ + --output_dir output + +# 通用版面解析 v3 产线 +# 处理 `input_images` 目录中所有后缀为 `.jpg` 的文件 +# 使用 GPU 0、2,每块 GPU 上 2 个产线实例,每个实例一次处理 4 个输入文件 +python infer_mp.py \ + --pipeline PPStructureV3 \ + --input_dir input_images/ \ + --device 'gpu:0,2' \ + --output_dir output \ + --instances_per_device 2 \ + --batch_size 4 \ + --input_glob_pattern '*.jpg' +``` diff --git a/docs/version3.x/pipeline_usage/pipeline_overview.en.md b/docs/version3.x/pipeline_usage/pipeline_overview.en.md new file mode 100644 index 0000000000000000000000000000000000000000..402b5da5cd75c6decd4ab7a0451950aaafada300 --- /dev/null +++ b/docs/version3.x/pipeline_usage/pipeline_overview.en.md @@ -0,0 +1,3 @@ +# Pipeline Overview + +A pipeline is a practical functional implementation composed of one or more modules. Through reasonable module combination and configuration, pipelines can meet the needs of complex application scenarios, such as technological applications like Optical Character Recognition (OCR). Pipelines not only demonstrate the integrated application of basic modules but also support capabilities such as high-performance inference and service-oriented deployment, providing users with higher development efficiency and broader application possibilities. 
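To make the module-composition idea concrete, the sketch below instantiates one pipeline and exports its effective configuration; the resulting YAML lists the sub-modules and sub-pipelines it is composed of, mirroring the formula recognition example earlier in this document. A minimal sketch, assuming a working PaddleOCR 3.x installation:

```python
from paddleocr import FormulaRecognitionPipeline

# A pipeline bundles several modules (layout detection, formula recognition,
# document preprocessing, ...) behind a single predict() interface.
pipeline = FormulaRecognitionPipeline()

# Export the effective configuration to inspect the composed modules.
pipeline.export_paddlex_config_to_yaml("FormulaRecognitionPipeline.yaml")
```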
diff --git a/docs/version3.x/pipeline_usage/pipeline_overview.md b/docs/version3.x/pipeline_usage/pipeline_overview.md new file mode 100644 index 0000000000000000000000000000000000000000..71d797d8ff2905053df208dfc9bc3573b8ee9ce6 --- /dev/null +++ b/docs/version3.x/pipeline_usage/pipeline_overview.md @@ -0,0 +1,3 @@ +# 产线概述 + +产线是由一个或多个模块组合而成的实际功能实现。通过合理的模块组合与配置,产线能够满足复杂应用场景的需求,比如光学字符识别(OCR)等技术应用。产线不仅体现了基础模块的整合应用,还支持高性能推理和服务化部署等能力,赋予用户更高的开发效率和更广泛的应用可能性。 diff --git a/docs/version3.x/pipeline_usage/seal_recognition.en.md b/docs/version3.x/pipeline_usage/seal_recognition.en.md index 3c4bb590a84fb0336d049b636be318058eaafeef..203154a6301747f3063a9069c351177fefbe0b86 100644 --- a/docs/version3.x/pipeline_usage/seal_recognition.en.md +++ b/docs/version3.x/pipeline_usage/seal_recognition.en.md @@ -2,7 +2,7 @@ comments: true --- -# Seal Text Recognition Pipeline Tutorial +# Seal Text Recognition Pipeline Usage Tutorial ## 1. Introduction to Seal Text Recognition Pipeline Seal text recognition is a technology that automatically extracts and recognizes the content of seals from documents or images. The recognition of seal text is part of document processing and has many applications in various scenarios, such as contract comparison, warehouse entry and exit review, and invoice reimbursement review. @@ -19,20 +19,21 @@ The seal text recognition pipeline is used to recognize the text content of seal - [Document Image Orientation Classification Module](../module_usage/doc_img_orientation_classification.en.md) (Optional) - [Text Image Unwarping Module](../module_usage/text_image_unwarping.en.md) (Optional) -If you prioritize model accuracy, choose a model with higher accuracy. If you prioritize inference speed, choose a model with faster inference speed. If you prioritize model storage size, choose a model with smaller storage size. +In this pipeline, you can choose the model to use based on the benchmark data below. -
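For instance, a model chosen from the benchmark tables below can be specified when the pipeline is created. The following is a minimal, hypothetical sketch: `SealRecognition` is the seal pipeline class named in the parallel inference guide above, while the `layout_detection_model_name` keyword is assumed by analogy with the formula recognition pipeline's parameters and should be checked against the seal pipeline's own parameter table.

```python
from paddleocr import SealRecognition

# Assumed keyword (pattern: <module>_model_name, as used by other pipelines
# in this document); verify the exact name in the seal pipeline docs.
pipeline = SealRecognition(layout_detection_model_name="RT-DETR-H_layout_3cls")
```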

-Layout Region Detection Module (Optional):
+ Layout Region Detection Module (Optional):
-* The layout detection model includes 20 common categories: document title, paragraph title, text, page number, abstract, table, references, footnotes, header, footer, algorithm, formula, formula number, image, table, seal, figure_table title, chart, and sidebar text and lists of references
+* Layout detection model, including 20 common categories: document title, paragraph title, text, page number, abstract, table of contents, references, footnotes, header, footer, algorithm, formula, formula number, image, table, figure and table title (figure title, table title, and chart title), seal, chart, sidebar text, and reference content
Model | Model Download Link | mAP(0.5) (%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
    34.6244 / 10.3945 510.57 / - 126.01 MA higher-precision layout area localization model trained on a self-built dataset containing Chinese and English papers, PPT, multi-layout magazines, contracts, books, exams, ancient books and research reports using RT-DETR-LA higher precision layout region localization model based on RT-DETR-L trained on a self-built dataset including Chinese and English papers, multi-column magazines, newspapers, PPTs, contracts, books, exam papers, research reports, ancient books, Japanese documents, and vertical text documents
Model | Model Download Link | mAP(0.5) (%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
    34.6244 / 10.3945 510.57 / - 123.76 MA high-precision layout area localization model trained on a self-built dataset containing Chinese and English papers, magazines, contracts, books, exams, and research reports using RT-DETR-L.A high precision layout region localization model based on RT-DETR-L trained on a self-built dataset including Chinese and English papers, magazines, contracts, books, exam papers, and research reports
    PP-DocLayout-MInference Model/Training Model13.3259 / 4.8685 44.0680 / 44.0680 22.578A layout area localization model with balanced precision and efficiency, trained on a self-built dataset containing Chinese and English papers, magazines, contracts, books, exams, and research reports using PicoDet-L.A balanced model of accuracy and efficiency based on PicoDet-L trained on a self-built dataset including Chinese and English papers, magazines, contracts, books, exam papers, and research reports
    PP-DocLayout-SInference Model/Training Model8.3008 / 2.3794 10.0623 / 9.9296 4.834A high-efficiency layout area localization model trained on a self-built dataset containing Chinese and English papers, magazines, contracts, books, exams, and research reports using PicoDet-S.A highly efficient layout region localization model based on PicoDet-S trained on a self-built dataset including Chinese and English papers, magazines, contracts, books, exam papers, and research reports
-> ❗ The above list includes the 4 core models that are key supported by the text recognition module. The module actually supports a total of 13 full models. It includes multiple predefined models of different categories, among which there are 10 models specifically for the seal category. Apart from the three core models mentioned above, the remaining models are listed as follows:
+> ❗ Listed above are the 4 core models that are the focus of the layout detection module, which supports a total of 13 full models, including multiple models with pre-defined different categories, among which 9 models include the seal category. Apart from the 4 core models mentioned above, the remaining models are as follows:
-👉 Details of Model List
+👉 Details of the Model List
-* 3-Class Layout Detection Model, including Table, Image, and Stamp
+* 3-class layout detection model, including table, image, seal
Model | Model Download Link | mAP(0.5) (%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
    8.99 / 2.22 16.11 / 8.73 4.8A high-efficiency layout area localization model trained on a self-built dataset of Chinese and English papers, magazines, and research reports using PicoDet-S.A highly efficient layout region localization model based on the lightweight PicoDet-S model trained on a self-built dataset including Chinese and English papers, magazines, and research reports
    PicoDet-L_layout_3clsInference Model/Training Model13.05 / 4.50 41.30 / 41.30 22.6A balanced efficiency and precision layout area localization model trained on a self-built dataset of Chinese and English papers, magazines, and research reports using PicoDet-L.An efficiency-accuracy balanced layout region localization model based on PicoDet-L trained on a self-built dataset including Chinese and English papers, magazines, and research reports
    RT-DETR-H_layout_3clsInference Model/Training Model114.93 / 27.71 947.56 / 947.56 470.1A high-precision layout area localization model trained on a self-built dataset of Chinese and English papers, magazines, and research reports using RT-DETR-H.A high precision layout region localization model based on RT-DETR-H trained on a self-built dataset including Chinese and English papers, magazines, and research reports
-* 17-Class Area Detection Model, including 17 common layout categories: Paragraph Title, Image, Text, Number, Abstract, Content, Figure Caption, Formula, Table, Table Caption, References, Document Title, Footnote, Header, Algorithm, Footer, and Stamp
+* 17-class region detection model, including 17 common layout categories: paragraph title, image, text, number, abstract, content, chart title, formula, table, table title, references, document title, footnote, header, algorithm, footer, seal
Model | Model Download Link | mAP(0.5) (%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
    9.11 / 2.12 15.42 / 9.12 4.8A high-efficiency layout area localization model trained on a self-built dataset of Chinese and English papers, magazines, and research reports using PicoDet-S.A highly efficient layout region localization model based on the lightweight PicoDet-S model trained on a self-built dataset including Chinese and English papers, magazines, and research reports
    PicoDet-L_layout_17clsInference Model/Training Model13.50 / 4.69 43.32 / 43.32 22.6A balanced efficiency and precision layout area localization model trained on a self-built dataset of Chinese and English papers, magazines, and research reports using PicoDet-L.An efficiency-accuracy balanced layout region localization model based on PicoDet-L trained on a self-built dataset including Chinese and English papers, magazines, and research reports
    RT-DETR-H_layout_17clsInference Model/Training Model115.29 / 104.09 995.27 / 995.27 470.2A high-precision layout area localization model trained on a self-built dataset of Chinese and English papers, magazines, and research reports using RT-DETR-H.A high precision layout region localization model based on RT-DETR-H trained on a self-built dataset including Chinese and English papers, magazines, and research reports
    +
    -

-Document Image Orientation Classification Module (Optional):

    +
    + Document Image Orientation Classification Module (Optional): - - + + @@ -196,12 +198,15 @@ The seal text recognition pipeline is used to recognize the text content of seal
Model | Model Download Link | Top-1 Acc (%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
    -

-Text Image Correction Module (Optional):

    +
    + +
    + Text Image Correction Module (Optional): - + @@ -211,19 +216,21 @@ The seal text recognition pipeline is used to recognize the text content of seal - +
Model | Model Download Link | CER | Model Storage Size (M) | Description
    UVDocInference Model/Training Model 0.179 30.3 MHigh-precision text image correction modelA high precision text image correction model
    +
    -

-Text Detection Module:

    +
    + Seal Text Detection Module: - - + + @@ -231,7 +238,7 @@ The seal text recognition pipeline is used to recognize the text content of seal - + @@ -239,7 +246,7 @@ The seal text recognition pipeline is used to recognize the text content of seal - + @@ -247,30 +254,31 @@ The seal text recognition pipeline is used to recognize the text content of seal
Model | Model Download Link | Detection Hmean (%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
Model Storage Size (M) | Description
PP-OCRv4_server_seal_det | Inference Model/Training Model | -98.21 +98.40 | 74.75 / 67.72 | 382.55 / 382.55 | 109
PP-OCRv4_mobile_seal_det | Inference Model/Training Model | -96.47 +96.36 | 7.82 / 3.09 | 48.28 / 23.97 | 4.6
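Both seal text detection models above plug into the same pipeline interface, so switching between them only trades accuracy against speed. A minimal usage sketch, assuming a local test image named `seal_sample.png` (a hypothetical path):

```python
from paddleocr import SealRecognition

pipeline = SealRecognition()
# "seal_sample.png" is a placeholder for an input image containing a seal.
output = pipeline.predict("seal_sample.png")
for res in output:
    res.print()                           # structured prediction output
    res.save_to_img(save_path="output")   # seal text visualization
    res.save_to_json(save_path="output")  # structured JSON result
```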
    +
    -

-Text Recognition Module:

    - +
    +Text Recognition Module: - + - - + + - + +PP-OCRv5_server_rec_infer.tar">Inference Model/Training Model - + +PP-OCRv5_mobile_rec_infer.tar">Inference Model/Training Model @@ -278,73 +286,73 @@ PP-OCRv5_mobile_rec_infer.tar">Inference Model/Inference Model/Pretrained Model +PP-OCRv4_server_rec_doc_infer.tar">Inference Model/Training Model - - + + - + - - + + - + - - + + +en_PP-OCRv4_mobile_rec_infer.tar">Inference Model/Training Model - - + +
-Model | Model Download Links
+Model | Model Download Link
Recognition Avg Accuracy(%)
-GPU Inference Time (ms) [Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms)
    Model Storage Size (M)IntroductionDescription
    PP-OCRv5_server_recInference Model/Pretrained Model 86.38 8.45/2.36 122.69/122.69 81 MPP-OCRv5_rec is a next-generation text recognition model. It aims to efficiently and accurately support the recognition of four major languages—Simplified Chinese, Traditional Chinese, English, and Japanese—as well as complex text scenarios such as handwriting, vertical text, pinyin, and rare characters using a single model. While maintaining recognition performance, it balances inference speed and model robustness, providing efficient and accurate technical support for document understanding in various scenarios.PP-OCRv5_rec is a new generation text recognition model. This model aims to efficiently and accurately support the recognition of four major languages: Simplified Chinese, Traditional Chinese, English, and Japanese, as well as complex text scenes like handwriting, vertical text, pinyin, and rare characters with a single model. It balances recognition effectiveness, inference speed, and model robustness, providing efficient and accurate technical support for document understanding in various scenarios.
    PP-OCRv5_mobile_recInference Model/Pretrained Model 81.29 1.46/5.43 5.32/91.79 86.58 6.65 / 2.38 32.92 / 32.9291 MPP-OCRv4_server_rec_doc is trained on a mixed dataset of more Chinese document data and PP-OCR training data, building upon PP-OCRv4_server_rec. It enhances the recognition capabilities for some Traditional Chinese characters, Japanese characters, and special symbols, supporting over 15,000 characters. In addition to improving document-related text recognition, it also enhances general text recognition capabilities.181 MPP-OCRv4_server_rec_doc is trained on a mix of more Chinese document data and PP-OCR training data based on PP-OCRv4_server_rec, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters, supporting over 15,000+ characters. Besides improving document-related text recognition, it also enhances general text recognition capabilities
    PP-OCRv4_mobile_recInference Model/Pretrained ModelPP-OCRv4_mobile_recInference Model/Training Model 83.28 4.82 / 1.20 16.74 / 4.6411 MA lightweight recognition model of PP-OCRv4 with high inference efficiency, suitable for deployment on various hardware devices, including edge devices.88 MPP-OCRv4 lightweight recognition model, with high inference efficiency, can be deployed on multiple hardware devices, including edge devices
    PP-OCRv4_server_rec Inference Model/Pretrained ModelPP-OCRv4_server_rec Inference Model/Training Model 85.19 6.58 / 2.43 33.17 / 33.1787 MThe server-side model of PP-OCRv4, offering high inference accuracy and deployable on various servers.151 MPP-OCRv4 server-side model, with high inference accuracy, can be deployed on various servers
    en_PP-OCRv4_mobile_recInference Model/Pretrained Model 70.39 4.81 / 0.75 16.10 / 5.317.3 MAn ultra-lightweight English recognition model trained based on the PP-OCRv4 recognition model, supporting English and numeric character recognition.66 MAn ultra-lightweight English recognition model trained based on the PP-OCRv4 recognition model, supporting English and number recognition
    -> ❗ The above section lists the **6 core models** that are primarily supported by the text recognition module. In total, the module supports **20 comprehensive models**, including multiple multilingual text recognition models. Below is the complete list of models: +> ❗ Listed above are the 6 core models that are the focus of the text recognition module, which supports a total of 20 full models, including multiple multi-language text recognition models, with the complete model list as follows:
    👉Details of the Model List -* PP-OCRv5 Multi-Scenario Models +* PP-OCRv5 Multi-Scene Model - - - - - - - + + + + + + + - + +PP-OCRv5_server_rec_infer.tar">Inference Model/Training Model - - + + - + +PP-OCRv5_mobile_rec_infer.tar">Inference Model/Training Model @@ -360,42 +368,44 @@ PP-OCRv5_mobile_rec_infer.tar">Inference Model/Inference Model/Training Model - + + - - + + - + - - + + - + - - + + - - + + - - + +
-Model | Model Download Links | Avg Accuracy for Chinese Recognition (%) | Avg Accuracy for English Recognition (%) | Avg Accuracy for Traditional Chinese Recognition (%) | Avg Accuracy for Japanese Recognition (%) | GPU Inference Time (ms) [Normal Mode / High-Performance Mode] | CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+Model | Model Download Link | Chinese Recognition Avg Accuracy(%) | English Recognition Avg Accuracy(%) | Traditional Chinese Recognition Avg Accuracy(%) | Japanese Recognition Avg Accuracy(%) | GPU Inference Time (ms) [Regular Mode / High-Performance Mode] | CPU Inference Time (ms) [Regular Mode / High-Performance Mode]
    Model Storage Size (M)IntroductionDescription
    PP-OCRv5_server_recInference Model/Pretrained Model 86.38 64.70 93.29 60.35 8.45/2.36 122.69/122.69 1.46/5.43 5.32/91.79 81 MPP-OCRv5_rec is a next-generation text recognition model. It aims to efficiently and accurately support the recognition of four major languages—Simplified Chinese, Traditional Chinese, English, and Japanese—as well as complex text scenarios such as handwriting, vertical text, pinyin, and rare characters using a single model. While maintaining recognition performance, it balances inference speed and model robustness, providing efficient and accurate technical support for document understanding in various scenarios.PP-OCRv5_rec is a new generation text recognition model. This model aims to efficiently and accurately support the recognition of four major languages: Simplified Chinese, Traditional Chinese, English, and Japanese, as well as complex text scenes like handwriting, vertical text, pinyin, and rare characters with a single model. It balances recognition effectiveness, inference speed, and model robustness, providing efficient and accurate technical support for document understanding in various scenarios.
    PP-OCRv5_mobile_recInference Model/Pretrained Model 81.29 66.00 83.5581.53PP-OCRv4_server_rec_docInference Model/Training Model86.58 6.65 / 2.38 32.92 / 32.9274.7 MPP-OCRv4_server_rec_doc is trained on a mixed dataset of more Chinese document data and PP-OCR training data based on PP-OCRv4_server_rec. It has added the recognition capabilities for some traditional Chinese characters, Japanese, and special characters. The number of recognizable characters is over 15,000. In addition to the improvement in document-related text recognition, it also enhances the general text recognition capability.91 MPP-OCRv4_server_rec_doc is trained on a mix of more Chinese document data and PP-OCR training data based on PP-OCRv4_server_rec, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters, supporting over 15,000+ characters. Besides improving document-related text recognition, it also enhances general text recognition capabilities
    PP-OCRv4_mobile_recInference Model/Training Model78.7483.28 4.82 / 1.20 16.74 / 4.6410.6 MThe lightweight recognition model of PP-OCRv4 has high inference efficiency and can be deployed on various hardware devices, including edge devices.11 MPP-OCRv4 lightweight recognition model, with high inference efficiency, can be deployed on multiple hardware devices, including edge devices
    PP-OCRv4_server_rec Inference Model/Training Model80.61 85.19 6.58 / 2.43 33.17 / 33.1771.2 MThe server-side model of PP-OCRv4 offers high inference accuracy and can be deployed on various types of servers.87 MPP-OCRv4 server-side model, with high inference accuracy, can be deployed on various servers
    PP-OCRv3_mobile_recInference Model/Training Model72.96PP-OCRv3_mobile_recInference Model/Training Model75.43 5.87 / 1.19 9.07 / 4.289.2 MPP-OCRv3’s lightweight recognition model is designed for high inference efficiency and can be deployed on a variety of hardware devices, including edge devices.11 MPP-OCRv3 lightweight recognition model, with high inference efficiency, can be deployed on multiple hardware devices, including edge devices
    @@ -403,10 +413,10 @@ PP-OCRv5_mobile_rec_infer.tar">Inference Model/Inference Model/Training Model @@ -415,7 +425,7 @@ PP-OCRv5_mobile_rec_infer.tar">Inference Model/ -SVTRv2 is a server text recognition model developed by the OpenOCR team of Fudan University's Visual and Learning Laboratory (FVL). It won the first prize in the PaddleOCR Algorithm Model Challenge - Task One: OCR End-to-End Recognition Task. The end-to-end recognition accuracy on the A list is 6% higher than that of PP-OCRv4. +SVTRv2 is a server-side text recognition model developed by the OpenOCR team from Fudan University's Visual and Learning Lab (FVL), which won first place in the PaddleOCR Algorithm Model Challenge - Task 1: OCR End-to-End Recognition, improving the end-to-end recognition accuracy on the A leaderboard by 6% compared to PP-OCRv4. @@ -424,10 +434,10 @@ SVTRv2 is a server text recognition model developed by the OpenOCR team of Fudan ModelModel Download Link Recognition Avg Accuracy(%) -GPU Inference Time (ms)
[Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms)
    [Regular Mode / High-Performance Mode] Model Storage Size (M) -Introduction +Description ch_RepSVTR_rec
    Inference Model/Training Model @@ -435,7 +445,7 @@ SVTRv2 is a server text recognition model developed by the OpenOCR team of Fudan 5.93 / 1.62 20.73 / 7.32 22.1 M - The RepSVTR text recognition model is a mobile text recognition model based on SVTRv2. It won the first prize in the PaddleOCR Algorithm Model Challenge - Task One: OCR End-to-End Recognition Task. The end-to-end recognition accuracy on the B list is 2.5% higher than that of PP-OCRv4, with the same inference speed. +RepSVTR text recognition model is a mobile-side text recognition model based on SVTRv2, which won first place in the PaddleOCR Algorithm Model Challenge - Task 1: OCR End-to-End Recognition, improving the end-to-end recognition accuracy on the B leaderboard by 2.5% compared to PP-OCRv4, with comparable inference speed. @@ -444,26 +454,28 @@ SVTRv2 is a server text recognition model developed by the OpenOCR team of Fudan ModelModel Download Link Recognition Avg Accuracy(%) -GPU Inference Time (ms)
[Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms)
    [Regular Mode / High-Performance Mode] Model Storage Size (M) -Introduction +Description -en_PP-OCRv4_mobile_recInference Model/Training Model +en_PP-OCRv4_mobile_recInference Model/Training Model 70.39 4.81 / 0.75 16.10 / 5.31 6.8 M -The ultra-lightweight English recognition model trained based on the PP-OCRv4 recognition model supports the recognition of English and numbers. +An ultra-lightweight English recognition model trained based on the PP-OCRv4 recognition model, supporting English and number recognition -en_PP-OCRv3_mobile_recInference Model/Training Model +en_PP-OCRv3_mobile_recInference Model/Training Model 70.69 5.44 / 0.75 8.65 / 5.57 7.8 M -The ultra-lightweight English recognition model trained based on the PP-OCRv3 recognition model supports the recognition of English and numbers. +An ultra-lightweight English recognition model trained based on the PP-OCRv3 recognition model, supporting English and number recognition @@ -472,122 +484,134 @@ SVTRv2 is a server text recognition model developed by the OpenOCR team of Fudan ModelModel Download Link Recognition Avg Accuracy(%) -GPU Inference Time (ms)
[Normal Mode / High-Performance Mode]
-CPU Inference Time (ms) [Normal Mode / High-Performance Mode]
+GPU Inference Time (ms) [Regular Mode / High-Performance Mode]
+CPU Inference Time (ms)
    [Regular Mode / High-Performance Mode] Model Storage Size (M) -Introduction +Description -korean_PP-OCRv3_mobile_recInference Model/Training Model +korean_PP-OCRv3_mobile_recInference Model/Training Model 60.21 5.40 / 0.97 9.11 / 4.05 8.6 M -The ultra-lightweight Korean recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Korean and numbers. +An ultra-lightweight Korean recognition model trained based on the PP-OCRv3 recognition model, supporting Korean and number recognition -japan_PP-OCRv3_mobile_recInference Model/Training Model +japan_PP-OCRv3_mobile_recInference Model/Training Model 45.69 5.70 / 1.02 8.48 / 4.07 8.8 M -The ultra-lightweight Japanese recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Japanese and numbers. +An ultra-lightweight Japanese recognition model trained based on the PP-OCRv3 recognition model, supporting Japanese and number recognition -chinese_cht_PP-OCRv3_mobile_recInference Model/Training Model +chinese_cht_PP-OCRv3_mobile_recInference Model/Training Model 82.06 5.90 / 1.28 9.28 / 4.34 9.7 M -The ultra-lightweight Traditional Chinese recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Traditional Chinese and numbers. +An ultra-lightweight Traditional Chinese recognition model trained based on the PP-OCRv3 recognition model, supporting Traditional Chinese and number recognition -te_PP-OCRv3_mobile_recInference Model/Training Model +te_PP-OCRv3_mobile_recInference Model/Training Model 95.88 5.42 / 0.82 8.10 / 6.91 7.8 M -The ultra-lightweight Telugu recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Telugu and numbers. +An ultra-lightweight Telugu recognition model trained based on the PP-OCRv3 recognition model, supporting Telugu and number recognition -ka_PP-OCRv3_mobile_recInference Model/Training Model +ka_PP-OCRv3_mobile_recInference Model/Training Model 96.96 5.25 / 0.79 9.09 / 3.86 8.0 M -The ultra-lightweight Kannada recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Kannada and numbers. +An ultra-lightweight Kannada recognition model trained based on the PP-OCRv3 recognition model, supporting Kannada and number recognition -ta_PP-OCRv3_mobile_recInference Model/Training Model +ta_PP-OCRv3_mobile_recInference Model/Training Model 76.83 5.23 / 0.75 10.13 / 4.30 8.0 M -The ultra-lightweight Tamil recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Tamil and numbers. +An ultra-lightweight Tamil recognition model trained based on the PP-OCRv3 recognition model, supporting Tamil and number recognition -latin_PP-OCRv3_mobile_recInference Model/Training Model +latin_PP-OCRv3_mobile_recInference Model/Training Model 76.93 5.20 / 0.79 8.83 / 7.15 7.8 M -The ultra-lightweight Latin recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Latin script and numbers. +An ultra-lightweight Latin recognition model trained based on the PP-OCRv3 recognition model, supporting Latin and number recognition -arabic_PP-OCRv3_mobile_recInference Model/Training Model +arabic_PP-OCRv3_mobile_recInference Model/Training Model 73.55 5.35 / 0.79 8.80 / 4.56 7.8 M -The ultra-lightweight Arabic script recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Arabic script and numbers. 
+An ultra-lightweight Arabic letter recognition model trained based on the PP-OCRv3 recognition model, supporting Arabic letters and number recognition -cyrillic_PP-OCRv3_mobile_recInference Model/Training Model +cyrillic_PP-OCRv3_mobile_recInference Model/Training Model 94.28 5.23 / 0.76 8.89 / 3.88 7.9 M - -The ultra-lightweight cyrillic alphabet recognition model trained based on the PP-OCRv3 recognition model supports the recognition of cyrillic letters and numbers. +An ultra-lightweight Cyrillic letter recognition model trained based on the PP-OCRv3 recognition model, supporting Cyrillic letters and number recognition -devanagari_PP-OCRv3_mobile_recInference Model/Training Model +devanagari_PP-OCRv3_mobile_recInference Model/Training Model 96.44 5.22 / 0.79 8.56 / 4.06 -7.9 M -The ultra-lightweight Devanagari script recognition model trained based on the PP-OCRv3 recognition model supports the recognition of Devanagari script and numbers. +7.9 M +An ultra-lightweight Devanagari letter recognition model trained based on the PP-OCRv3 recognition model, supporting Devanagari letters and number recognition
    +
    -Test Environment Description: +
    + Test Environment Description:
    • Performance Test Environment
-• Test Dataset:
-  • Document Image Orientation Classification Module: A self-built dataset using PaddleOCR, covering multiple scenarios such as ID cards and documents, containing 1000 images.
-  • Text Image Rectification Model: DocUNet
-  • Layout Detection Model: A self-built layout detection dataset using PaddleOCR, including 500 images of common document types such as Chinese and English papers, magazines, contracts, books, exam papers, and research reports.
-  • 3-Category Layout Detection Model: A self-built layout detection dataset using PaddleOCR, containing 1154 images of common document types such as Chinese and English papers, magazines, and research reports.
-  • 17-Category Region Detection Model: A self-built layout detection dataset using PaddleOCR, including 892 images of common document types such as Chinese and English papers, magazines, and research reports.
-  • Text Detection Model: A self-built Chinese dataset using PaddleOCR, covering multiple scenarios such as street scenes, web images, documents, and handwriting, with 500 images for detection.
-  • Chinese Recognition Model: A self-built Chinese dataset using PaddleOCR, covering multiple scenarios such as street scenes, web images, documents, and handwriting, with 11,000 images for text recognition.
-  • ch_SVTRv2_rec: Evaluation set A for "OCR End-to-End Recognition Task" in the PaddleOCR Algorithm Model Challenge
-  • ch_RepSVTR_rec: Evaluation set B for "OCR End-to-End Recognition Task" in the PaddleOCR Algorithm Model Challenge.
-  • English Recognition Model: A self-built English dataset using PaddleX.
-  • Multilingual Recognition Model: A self-built multilingual dataset using PaddleX.
-  • Text Line Orientation Classification Model: A self-built dataset using PaddleOCR, covering various scenarios such as ID cards and documents, containing 1000 images.
-  • Seal Text Detection Model: A self-built dataset using PaddleOCR, containing 500 images of circular seal textures.
-• Hardware Configuration:
+• Test Dataset:
+  • Document Image Orientation Classification Model: Self-built internal dataset covering multiple scenarios such as documents and certificates, containing 1000 images.
+  • Text Image Correction Model: DocUNet.
+  • Layout Region Detection Model: PaddleOCR self-built layout region detection dataset, containing 500 common document type images such as Chinese and English papers, magazines, contracts, books, exam papers, and research reports.
+  • 3-Class Layout Detection Model: PaddleOCR self-built layout region detection dataset, containing 1154 common document type images such as Chinese and English papers, magazines, and research reports.
+  • 17-Class Region Detection Model: PaddleOCR self-built layout region detection dataset, containing 892 common document type images such as Chinese and English papers, magazines, and research reports.
+  • Text Detection Model: PaddleOCR self-built Chinese dataset covering multiple scenarios such as street scenes, web images, documents, and handwriting, where detection includes 500 images.
+  • Chinese Recognition Model: PaddleOCR self-built Chinese dataset covering multiple scenarios such as street scenes, web images, documents, and handwriting, where text recognition includes 11,000 images.
+  • ch_SVTRv2_rec: PaddleOCR Algorithm Model Challenge - Task 1: OCR End-to-End Recognition A leaderboard evaluation set.
+  • ch_RepSVTR_rec: PaddleOCR Algorithm Model Challenge - Task 1: OCR End-to-End Recognition B leaderboard evaluation set.
+  • English Recognition Model: Self-built internal English dataset.
+  • Multilingual Recognition Model: Self-built internal multilingual dataset.
+  • Text Line Orientation Classification Model: Self-built internal dataset covering multiple scenarios such as documents and certificates, containing 1000 images.
+  • Seal Text Detection Model: Self-built internal dataset containing 500 circular seal images.
+• Hardware Configuration:
  • GPU: NVIDIA Tesla T4
  • CPU: Intel Xeon Gold 6271C @ 2.60GHz
-  • Other Environments: Ubuntu 20.04 / cuDNN 8.6 / TensorRT 8.5.2.2
+  • Other Environment: Ubuntu 20.04 / cuDNN 8.6 / TensorRT 8.5.2.2
| Mode | GPU Configuration | CPU Configuration | Acceleration Technology Combination |
|---|---|---|---|
| Regular Mode | FP32 precision / no TRT acceleration | FP32 precision / 8 threads | PaddleInference |
| High-Performance Mode | Optimal combination of prior precision type and acceleration strategy | FP32 precision / 8 threads | Optimal prior backend selected (Paddle/OpenVINO/TRT, etc.) |

If model accuracy matters most to you, choose a model with higher accuracy; if inference speed matters most, choose a faster model; if storage size matters most, choose a smaller model.

## 2. Quick Start

Before using the seal text recognition pipeline locally, please ensure that you have completed the installation of the wheel package according to the [installation tutorial](../installation.md). Once the installation is complete, you can experience it locally via the command line or integrate it with Python.

### 2.1 Command Line Experience

You can quickly experience the seal_recognition pipeline effect with a single command:

```bash
paddleocr seal_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/seal_text_det.png \
    --use_doc_orientation_classify False \
    --use_doc_unwarping False
```

The command line supports the following parameters:

| Parameter | Description | Type | Default |
|---|---|---|---|
| `input` | Data to be predicted; required. Local path of an image or PDF file, e.g., `/root/data/img.jpg`; URL of an image or PDF file; or a local directory containing the images to be predicted, e.g., `/root/data/` (prediction of PDF files inside directories is currently not supported; PDF files must be specified by exact file path). | `str` | |
| `save_path` | Path to save the inference results. If not set, results are not saved locally. | `str` | |
| `doc_orientation_classify_model_name` | Name of the document orientation classification model. If not set, the pipeline default model is used. | `str` | |
| `doc_orientation_classify_model_dir` | Directory path of the document orientation classification model. If not set, the official model is downloaded. | `str` | |
| `doc_unwarping_model_name` | Name of the text image unwarping model. If not set, the pipeline default model is used. | `str` | |
| `doc_unwarping_model_dir` | Directory path of the text image unwarping model. If not set, the official model is downloaded. | `str` | |
| `layout_detection_model_name` | Name of the layout detection model. If not set, the pipeline default model is used. | `str` | |
| `layout_detection_model_dir` | Directory path of the layout detection model. If not set, the official model is downloaded. | `str` | |
| `seal_text_detection_model_name` | Name of the seal text detection model. If not set, the pipeline default model is used. | `str` | |
| `seal_text_detection_model_dir` | Directory path of the seal text detection model. If not set, the official model is downloaded. | `str` | |
| `text_recognition_model_name` | Name of the text recognition model. If not set, the pipeline default model is used. | `str` | |
| `text_recognition_model_dir` | Directory path of the text recognition model. If not set, the official model is downloaded. | `str` | |
| `text_recognition_batch_size` | Batch size for the text recognition model. If not set, defaults to 1. | `int` | |
| `use_doc_orientation_classify` | Whether to load and use the document orientation classification module. If not set, defaults to the pipeline-initialized value (`True`). | `bool` | |
| `use_doc_unwarping` | Whether to load and use the text image correction module. If not set, defaults to the pipeline-initialized value (`True`). | `bool` | |
| `use_layout_detection` | Whether to load and use the layout detection module. If not set, defaults to the pipeline-initialized value (`True`). | `bool` | |
| `layout_threshold` | Score threshold for the layout model; any value between 0 and 1. If not set, the default of 0.5 is used. | `float` | |
| `layout_nms` | Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If not set, defaults to the pipeline-initialized value (`True`). | `bool` | |
| `layout_unclip_ratio` | Unclip ratio for detected boxes in the layout detection model; any float greater than 0. If not set, the default of 1.0 is used. | `float` | |
| `layout_merge_bboxes_mode` | Merging mode for the detection boxes output by the layout model: `large` keeps only the largest outer box among overlapping boxes and removes the inner ones; `small` keeps only the smallest inner boxes and removes the outer ones; `union` performs no filtering and keeps both. If not set, the default is `large`. | `str` | |
| `seal_det_limit_side_len` | Image side length limit for seal text detection; any integer greater than 0. If not set, the default of 736 is used. | `int` | |
| `seal_det_limit_type` | Limit type for the image side length in seal text detection; supports `min` and `max`: `min` ensures the shortest side is no less than `det_limit_side_len`, `max` ensures the longest side is no greater than `limit_side_len`. If not set, the default is `min`. | `str` | |
| `seal_det_thresh` | Pixel threshold; pixels with scores above this value in the probability map are considered text pixels. Any float greater than 0. If not set, the default of 0.2 is used. | `float` | |
| `seal_det_box_thresh` | Box threshold; boxes whose average pixel score exceeds this value are considered text regions. Any float greater than 0. If not set, the default of 0.6 is used. | `float` | |
| `seal_det_unclip_ratio` | Expansion ratio for seal text detection; the larger the value, the larger the expanded area. Any float greater than 0. If not set, the default of 0.5 is used. | `float` | |
| `seal_rec_score_thresh` | Recognition score threshold; text results above this value are kept. Any float greater than 0. If not set, the default of 0.0 is used (no threshold). | `float` | |
| `device` | Device used for inference; supports specifying a card number: CPU (`cpu`), GPU (`gpu:0`), NPU (`npu:0`), XPU (`xpu:0`), MLU (`mlu:0`), DCU (`dcu:0`). If not set, the pipeline-initialized value is used; during initialization, local GPU 0 is preferred, falling back to CPU if unavailable. | `str` | |
| `enable_hpi` | Whether to enable high-performance inference. | `bool` | `False` |
| `use_tensorrt` | Whether to use TensorRT for accelerated inference. | `bool` | `False` |
| `min_subgraph_size` | Minimum subgraph size used to optimize model subgraph computation. | `int` | `3` |
| `precision` | Computation precision, e.g., `fp32`, `fp16`. | `str` | `"fp32"` |
| `enable_mkldnn` | Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration is not used even if this flag is set. | `bool` | `True` |
| `cpu_threads` | Number of threads used for inference on CPU. | `int` | `8` |
| `paddlex_config` | Path to the PaddleX pipeline configuration file. | `str` | |
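For example, the sketch below combines a few of the flags above; the flag names come from the table, and the specific values are illustrative only:

```bash
# Illustrative: run on GPU 0, save results to ./output, and keep only confident text
paddleocr seal_recognition -i ./seal_text_det.png \
    --device gpu:0 \
    --save_path output \
    --seal_rec_score_thresh 0.5
```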
After running the basic command above, the results will be printed to the terminal, as follows:

```bash
{'res': {'input_path': './seal_text_det.png', 'model_settings': {'use_doc_preprocessor': True, 'use_layout_detection': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 16, 'label': 'seal', 'score': 0.975529670715332, 'coordinate': [6.191284, 0.16680908, 634.39325, 628.85345]}]}, 'seal_res_list': [{'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': [array([[320,  38],
       ...,
       [315,  38]]), array([[461, 347],
       ...,
       [456, 346]]), array([[439, 445],
       ...,
       [434, 444]]), array([[158, 468],
       ...,
       [154, 466]])], 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.2, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 0.5}, 'text_type': 'seal', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0, 'rec_texts': ['天津君和缘商贸有限公司', '发票专用章', '吗繁物', '5263647368706'], 'rec_scores': array([0.99340463, ..., 0.9916274 ]), 'rec_polys': [array([[320,  38],
       ...,
       [315,  38]]), array([[461, 347],
       ...,
       [456, 346]]), array([[439, 445],
       ...,
       [434, 444]]), array([[158, 468],
       ...,
       [154, 466]])], 'rec_boxes': array([], dtype=float64)}]}}
```

The visualized results, including the seal OCR visualization, are saved under `save_path`.

### 2.2 Python Script Integration

* The above command line is for quickly experiencing and viewing the effect. Generally, in a project, you often need to integrate through code; you can complete quick inference of the pipeline with just a few lines, as shown below.
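A minimal sketch of such an integration, assuming the `SealRecognition` entry point together with the result methods (`print()`, `save_to_img()`, `save_to_json()`) documented later in this section:

```python
from paddleocr import SealRecognition

# Instantiate the seal text recognition pipeline (official models are downloaded on first use)
pipeline = SealRecognition(
    use_doc_orientation_classify=False,  # skip document orientation classification
    use_doc_unwarping=False,             # skip text image unwarping
)

# predict() returns a generator; each element is the result for one image or PDF page
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/seal_text_det.png")

for res in output:
    res.print()                           # print the structured result to the terminal
    res.save_to_img(save_path="output")   # save the visualized result
    res.save_to_json(save_path="output")  # save the result as JSON
```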
In the above Python script, the following steps were executed:

(1) Instantiate a pipeline object for seal text recognition using the `SealRecognition()` class, with the specific parameter descriptions as follows:

| Parameter | Description | Type | Default |
|---|---|---|---|
| `doc_orientation_classify_model_name` | Name of the document orientation classification model. If set to `None`, the pipeline default model is used. | `str` | `None` |
| `doc_orientation_classify_model_dir` | Directory path of the document orientation classification model. If set to `None`, the official model will be downloaded. | `str` | `None` |
| `doc_unwarping_model_name` | Name of the document unwarping model. If set to `None`, the pipeline default model is used. | `str` | `None` |
| `doc_unwarping_model_dir` | Directory path of the document unwarping model. If set to `None`, the official model will be downloaded. | `str` | `None` |
| `layout_detection_model_name` | Name of the layout detection model. If set to `None`, the pipeline default model is used. | `str` | `None` |
| `layout_detection_model_dir` | Directory path of the layout detection model. If set to `None`, the official model will be downloaded. | `str` | `None` |
| `seal_text_detection_model_name` | Name of the seal text detection model. If set to `None`, the default model will be used. | `str` | `None` |
| `seal_text_detection_model_dir` | Directory of the seal text detection model. If set to `None`, the official model will be downloaded. | `str` | `None` |
| `text_recognition_model_name` | Name of the text recognition model. If set to `None`, the pipeline default model is used. | `str` | `None` |
| `text_recognition_model_dir` | Directory path of the text recognition model. If set to `None`, the official model will be downloaded. | `str` | `None` |
| `text_recognition_batch_size` | Batch size for the text recognition model. If set to `None`, the default batch size is 1. | `int` | `None` |
| `use_doc_orientation_classify` | Whether to enable the document orientation classification module. If set to `None`, the default value is `True`. | `bool` | `None` |
| `use_doc_unwarping` | Whether to enable the document image unwarping module. If set to `None`, the default value is `True`. | `bool` | `None` |
| `use_layout_detection` | Whether to load and use the layout detection module. If set to `None`, defaults to the pipeline-initialized value, `True`. | `bool` | `None` |
| `layout_threshold` | Score threshold for the layout model: a `float` between 0 and 1; a `dict` such as `{0: 0.1}` with class IDs as keys and per-class thresholds as values; or `None` to use the pipeline default of 0.5. | `float` \| `dict` | `None` |
| `layout_nms` | Whether to use Non-Maximum Suppression (NMS) as post-processing for layout detection. If set to `None`, defaults to the pipeline-initialized value, `True`. | `bool` | `None` |
| `layout_unclip_ratio` | Expansion ratio for the bounding boxes from the layout detection model: a `float` greater than 0; a `Tuple[float,float]` giving the horizontal and vertical expansion ratios; a `dict` with `int` keys representing `cls_id` and tuple values, e.g., `{0: (1.1, 2.0)}` meaning width is expanded 1.1x and height 2.0x for class-0 boxes; or `None` to use the pipeline default of 1.0. | `float` \| `Tuple[float,float]` \| `dict` | `None` |
| `layout_merge_bboxes_mode` | Filtering method for overlapping boxes in layout detection: a `str` among `large`, `small`, and `union` to keep the larger box, the smaller box, or both; a `dict` with `int` keys representing `cls_id` and `str` values, e.g., `{0: "large", 2: "small"}`, to use different modes per class; or `None` to use the pipeline default, `large`. | `str` \| `dict` | `None` |
| `seal_det_limit_side_len` | Image side length limit for seal text detection: any `int` greater than 0, or `None` for the default of 736. | `int` | `None` |
| `seal_det_limit_type` | Limit type for the seal text detection image side length: `min` ensures the shortest side is no less than `det_limit_side_len`, `max` ensures the longest side is no greater than `limit_side_len`; `None` uses the default, `min`. | `str` | `None` |
| `seal_det_thresh` | Pixel threshold for detection; pixels with scores greater than this value in the probability map are considered text pixels. Any `float` greater than 0, or `None` for the default of 0.2. | `float` | `None` |
| `seal_det_box_thresh` | Bounding box threshold; if the average score of all pixels inside a detection box exceeds this value, it is considered a text region. Any `float` greater than 0, or `None` for the default of 0.6. | `float` | `None` |
| `seal_det_unclip_ratio` | Expansion ratio for seal text detection; the larger the value, the larger the expanded area. Any `float` greater than 0, or `None` for the default of 0.5. | `float` | `None` |
| `seal_rec_score_thresh` | Score threshold for seal text recognition; text results with scores above this threshold are retained. Any `float` greater than 0, or `None` for the default of 0.0 (no threshold). | `float` | `None` |
| `device` | Device used for inference; supports specifying a device ID: CPU (`cpu`), GPU (`gpu:0`), NPU (`npu:0`), XPU (`xpu:0`), MLU (`mlu:0`), DCU (`dcu:0`). If set to `None`, the pipeline-initialized value is used; during initialization, local GPU 0 is preferred, falling back to CPU if unavailable. | `str` | `None` |
| `enable_hpi` | Whether to enable high-performance inference. | `bool` | `False` |
| `use_tensorrt` | Whether to use TensorRT for accelerated inference. | `bool` | `False` |
| `min_subgraph_size` | Minimum subgraph size used to optimize model subgraph computation. | `int` | `3` |
| `precision` | Computation precision, e.g., `fp32`, `fp16`. | `str` | `"fp32"` |
| `enable_mkldnn` | Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration is not used even if this flag is set. | `bool` | `True` |
| `cpu_threads` | Number of threads used for inference on CPU. | `int` | `8` |
| `paddlex_config` | Path to the PaddleX pipeline configuration file. | `str` | `None` |
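For instance, a sketch that wires a few of the options above into instantiation; the parameter names come from this table, and the specific values are illustrative only:

```python
from paddleocr import SealRecognition

# Illustrative configuration; any parameter left unset falls back to the pipeline default
pipeline = SealRecognition(
    device="gpu:0",                # use "cpu" if no GPU is available
    layout_threshold=0.5,          # a per-class dict such as {16: 0.45} is also accepted
    seal_det_limit_side_len=736,
    seal_det_limit_type="min",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
)
```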
(2) Call the `predict()` method of the seal text recognition pipeline object for inference. This method returns a `generator`. Below are the parameters of the `predict()` method and their descriptions:
| Parameter | Parameter Description | Parameter Type | Default Value |
|---|---|---|---|
| `input` | Input data to be predicted; required. Supports multiple types: `Python Var` (image data represented by `numpy.ndarray`); `str` (local path of an image or PDF file, e.g., `/root/data/img.jpg`; URL of an image or PDF file; or a local directory containing images to be predicted, e.g., `/root/data/`, noting that PDF files inside directories are not supported and must be specified by exact file path); `list` (elements of the above types, e.g., `[numpy.ndarray, numpy.ndarray]`, `["/root/data/img1.jpg", "/root/data/img2.jpg"]`, `["/root/data1", "/root/data2"]`). | `Python Var` \| `str` \| `list` | |
| `use_doc_orientation_classify` | Whether to use the document orientation classification module during inference. | `bool` | `None` |
| `use_doc_unwarping` | Whether to use the text image correction module during inference. | `bool` | `None` |
| `use_layout_detection` | Whether to use the layout detection module during inference. | `bool` | `None` |
| `layout_threshold` | Same as the parameter during instantiation. | `float` \| `dict` | `None` |
| `layout_nms` | Same as the parameter during instantiation. | `bool` | `None` |
| `layout_unclip_ratio` | Same as the parameter during instantiation. | `float` \| `Tuple[float,float]` \| `dict` | `None` |
| `layout_merge_bboxes_mode` | Same as the parameter during instantiation. | `str` \| `dict` | `None` |
| `seal_det_limit_side_len` | Same as the parameter during instantiation. | `int` | `None` |
| `seal_det_limit_type` | Same as the parameter during instantiation. | `str` | `None` |
| `seal_det_thresh` | Same as the parameter during instantiation. | `float` | `None` |
| `seal_det_box_thresh` | Same as the parameter during instantiation. | `float` | `None` |
| `seal_det_unclip_ratio` | Same as the parameter during instantiation. | `float` | `None` |
| `seal_rec_score_thresh` | Same as the parameter during instantiation. | `float` | `None` |
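Parameters passed to `predict()` take effect for that call. A sketch with a few illustrative detection-side overrides (parameter names are from the table above):

```python
# Illustrative per-call overrides; omitted parameters keep the values set at instantiation
output = pipeline.predict(
    input="./seal_text_det.png",
    seal_det_limit_side_len=960,  # allow larger seal images before resizing
    seal_rec_score_thresh=0.5,    # keep only confident text results
)
for res in output:
    res.print()
```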
(3) Process the prediction results. The prediction result for each sample is of `dict` type and supports operations such as printing, saving as an image, and saving as a `json` file. The relevant methods and parameters are as follows:

| Method | Description | Parameter | Type | Parameter Description | Default |
|---|---|---|---|---|---|
| `print()` | Print results to the terminal | `format_json` | `bool` | Whether to format the output content using JSON indentation. | `True` |
| | | `indent` | `int` | Indentation level to beautify the JSON output for better readability; effective only when `format_json` is `True`. | `4` |
| | | `ensure_ascii` | `bool` | Whether to escape non-ASCII characters to Unicode. When `True`, all non-ASCII characters are escaped; `False` retains the original characters. Effective only when `format_json` is `True`. | `False` |
| `save_to_json()` | Save results as a `json` file | `save_path` | `str` | File path to save the results. When it is a directory, the saved file is named consistently with the input file. | `None` |
| | | `indent` | `int` | Indentation level to beautify the JSON output for better readability; effective only when `format_json` is `True`. | `4` |
| | | `ensure_ascii` | `bool` | Whether to escape non-ASCII characters to Unicode. When `True`, all non-ASCII characters are escaped; `False` retains the original characters. Effective only when `format_json` is `True`. | `False` |
| `save_to_img()` | Save results as an image file | `save_path` | `str` | File path to save the results; supports a directory or a file path. | `None` |

If the pipeline meets your requirements for inference speed and accuracy, you can proceed directly with development integration/deployment.

If you need to integrate the pipeline into your Python project, you can refer to the example code in [2.2 Python Script Method](#22-python脚本方式集成).

In addition, PaddleOCR also provides three other deployment methods, detailed as follows:

🚀 High-Performance Inference: In real-world production environments, many applications have stringent performance requirements for deployment strategies, especially response speed, to ensure efficient system operation and a smooth user experience. To address this, PaddleOCR offers high-performance inference capabilities aimed at deeply optimizing model inference and pre/post-processing, significantly accelerating the end-to-end process.
For detailed high-performance inference procedures, please refer to [High-Performance Inference](../deployment/high_performance_inference.md).

## 4. Custom Development

If the default model weights provided by the seal text recognition pipeline do not meet your accuracy or speed requirements, you can try fine-tuning the existing models with your own domain-specific or application data to improve the recognition performance of the pipeline in your scenario.

### 4.1 Model Fine-Tuning

Since the seal text recognition pipeline consists of several modules, unsatisfactory pipeline performance may stem from any one of them. You can analyze images with poor recognition results to identify the problematic module, then refer to the corresponding fine-tuning tutorial links in the table below to fine-tune the model.
### 4.2 Model Application

After completing fine-tuning training with your private dataset, you obtain local model weight files. You can then use the fine-tuned weights either by specifying the local model save path through parameters or through a custom pipeline configuration file.

#### 4.2.1 Specify the Local Model Path via Parameters

When initializing the pipeline object, specify the local model path through parameters. Taking fine-tuned weights for the document orientation classification model as an example:

Command line method:

```bash
# Specify the local model path via --doc_orientation_classify_model_dir
paddleocr seal_recognition -i ./seal_text_det.png --doc_orientation_classify_model_dir your_orientation_classify_model_path

# PP-LCNet_x1_0_doc_ori is the default document orientation classification model. If you fine-tuned a different model, change the model name via --doc_orientation_classify_model_name
paddleocr seal_recognition -i ./seal_text_det.png --doc_orientation_classify_model_name PP-LCNet_x1_0_doc_ori --doc_orientation_classify_model_dir your_orientation_classify_model_path
```

Script method:

```python
from paddleocr import SealRecognition

# Specify the local model path via doc_orientation_classify_model_dir
pipeline = SealRecognition(doc_orientation_classify_model_dir="./your_orientation_classify_model_path")

# PP-LCNet_x1_0_doc_ori is the default document orientation classification model. If you fine-tuned a different model, change the model name via doc_orientation_classify_model_name
# pipeline = SealRecognition(doc_orientation_classify_model_name="PP-LCNet_x1_0_doc_ori", doc_orientation_classify_model_dir="./your_orientation_classify_model_path")
```

#### 4.2.2 Specify the Local Model Path via the Configuration File

1. Obtain the pipeline configuration file

Call the `export_paddlex_config_to_yaml` method of the seal text recognition pipeline object in PaddleOCR to export the current pipeline configuration to a YAML file:

```python
from paddleocr import SealRecognition

pipeline = SealRecognition()
pipeline.export_paddlex_config_to_yaml("SealRecognition.yaml")
```

2. Modify the configuration file

After obtaining the default pipeline configuration file, replace the corresponding entries with the local paths of your fine-tuned model weights. For example:

```yaml
......
SubPipelines:
  DocPreprocessor:
    SubModules:
      DocOrientationClassify:
        model_dir: null # Replace with the path of the fine-tuned document orientation classification model weights
        model_name: PP-LCNet_x1_0_doc_ori # If the name of the fine-tuned model differs from the default, modify it here as well
        module_name: doc_text_orientation
      DocUnwarping:
        model_dir: null # Replace with the path of the fine-tuned document unwarping model weights
        model_name: UVDoc # If the name of the fine-tuned model differs from the default, modify it here as well
        module_name: image_unwarping
    pipeline_name: doc_preprocessor
    use_doc_orientation_classify: true
    use_doc_unwarping: true
......
```

The pipeline configuration file contains not only the parameters supported by the SealRecognition CLI and Python API but also more advanced configuration. Detailed information can be found in the [PaddleX Model Pipeline Usage Overview](https://paddlepaddle.github.io/PaddleX/3.0/en/pipeline_usage/pipeline_develop_guide.html), where you can find the corresponding pipeline usage tutorial and adjust configurations as needed.

3. Load the pipeline configuration file in the CLI

After modifying the configuration file, specify its path with the `--paddlex_config` parameter on the command line; PaddleOCR reads its contents as the pipeline configuration. Example:

```bash
paddleocr seal_recognition --paddlex_config SealRecognition.yaml ...
```

4. Load the pipeline configuration file in the Python API

When initializing the pipeline object, pass the PaddleX pipeline configuration file path or a configuration dictionary through the `paddlex_config` parameter; PaddleOCR reads its contents as the pipeline configuration. Example:

```python
from paddleocr import SealRecognition

pipeline = SealRecognition(paddlex_config="SealRecognition.yaml")
```

diff --git a/docs/version3.x/pipeline_usage/seal_recognition.md b/docs/version3.x/pipeline_usage/seal_recognition.md
index 4cfa0a993d6cd50d0e518981195f704281cea092..11633ebe01b719406606d216f3830d9fc6b29513 100644
--- a/docs/version3.x/pipeline_usage/seal_recognition.md
+++ b/docs/version3.x/pipeline_usage/seal_recognition.md
@@ -681,213 +681,188 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu
 input
-待预测数据,支持多种输入类型,必填。
-
      -
    • Python Var:如 numpy.ndarray 表示的图像数据
    • -
    • str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
    • -
    • List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]["/root/data/img1.jpg", "/root/data/img2.jpg"]["/root/data1", "/root/data2"]
    • -
    +待预测数据,必填。 +如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果不设置,将会下载官方模型。 str -None + layout_detection_model_name -版面检测模型的名称。如果设置为None, 将会使用产线默认模型。 +版面检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + layout_detection_model_dir -版面检测模型的目录路径。如果设置为None, 将会下载官方模型。 +版面检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + seal_text_detection_model_name -印章文本检测模型的名称。如果设置为None, 将会使用产线默认模型。 +印章文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + seal_text_detection_model_dir -印章文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +印章文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_model_name -文本识别模型的名称。如果设置为None, 将会使用产线默认模型。 +文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_recognition_model_dir -文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_layout_detection -是否加载版面检测模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用版面检测模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_threshold -版面检测置信度阈值,得分大于该阈值才会被输出。 -
      -
    • float:大于 0 的任意浮点数 -
    • dict:key是int类别id, value是大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +版面模型得分阈值。0-1 之间的任意浮点数。如果不设置,将默认使用产线初始化的该参数值,初始化为 0.5。 -float|dict -None +float + layout_nms -版面检测是否使用后处理NMS。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +版面检测是否使用后处理NMS。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + layout_unclip_ratio 检测框的边长缩放倍数。 -
      -
    • float, 大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
    • -
    • 列表, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
    • -
    • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为1.0
    • -
    +大于0的浮点数,如 1.1 ,表示将模型输出的检测框中心不变,宽和高都扩张1.1倍如果不设置,将默认使用产线初始化的该参数值,初始化为1.0。 -float|list -None +float + layout_merge_bboxes_mode 版面检测中模型输出的检测框的合并处理模式。
      -
    • large, 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
    • -
    • small, 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
    • -
    • union, 不进行框的过滤处理,内外框都保留
    • -
    • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为large
    • -
    +
  • large,设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框;
  • +
  • small,设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框;
  • +
  • union,不进行框的过滤处理,内外框都保留;
  • +如果不设置,将默认使用产线初始化的该参数值,初始化为largestr -None + seal_det_limit_side_len 印章文本检测的图像边长限制。 -
      -
    • int:大于 0 的任意整数;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
    • -
    +大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 736int -None + seal_det_limit_type 印章文本检测的图像边长限制类型。 -
      -
    • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
    • -
    +支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len。如果不设置,将默认使用产线初始化的该参数值,初始化为 minstr -None + seal_det_thresh -检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.2
    +检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.2float -None + seal_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
    +大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + seal_det_unclip_ratio 印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.5float -None + seal_rec_score_thresh -文本识别阈值,得分大于该阈值的文本结果会被保留。 -
      -
    • float:大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +文本识别阈值,得分大于该阈值的文本结果会被保留。大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
    • CPU:如 cpu 表示使用 CPU 进行推理;
    • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
    • @@ -895,11 +870,10 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu
    • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
    • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
    • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
    • -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    • -
    +如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -927,10 +901,10 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -942,7 +916,7 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu paddlex_config PaddleX产线配置文件路径。 str -None + @@ -952,16 +926,28 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu 运行结果会被打印到终端上,默认配置的 seal_recognition 产线的运行结果如下: ```bash -{'res': {'input_path': '/root/.paddlex/predict_input/seal_text_det.png', 'model_settings': {'use_doc_preprocessor': True, 'use_layout_detection': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 0}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 16, 'label': 'seal', 'score': 0.9700419902801514, 'coordinate': [0.7737427, 2.4994812, 639.28375, 640]}]}, 'seal_res_list': [{'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': [array([[433, 32], +{'res': {'input_path': './seal_text_det.png', 'model_settings': {'use_doc_preprocessor': True, 'use_layout_detection': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 16, 'label': 'seal', 'score': 0.975529670715332, 'coordinate': [6.191284, 0.16680908, 634.39325, 628.85345]}]}, 'seal_res_list': [{'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': [array([[320, 38], ..., - [323, 27]])], 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.2, 'box_thresh': 0.6, 'unclip_ratio': 0.5}, 'text_type': 'seal', 'textline_orientation_angles': array([-1]), 'text_rec_score_thresh': 0, 'rec_texts': ['天津君和缘商贸有限公司'], 'rec_scores': array([0.99743599]), 'rec_polys': [array([[433, 32], + [315, 38]]), array([[461, 347], ..., - [323, 27]])], 'rec_boxes': array([], dtype=float64)}]}} + [456, 346]]), array([[439, 445], + ..., + [434, 444]]), array([[158, 468], + ..., + [154, 466]])], 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.2, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 0.5}, 'text_type': 'seal', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0, 'rec_texts': ['天津君和缘商贸有限公司', '发票专用章', '吗繁物', '5263647368706'], 'rec_scores': array([0.99340463, ..., 0.9916274 ]), 'rec_polys': [array([[320, 38], + ..., + [315, 38]]), array([[461, 347], + ..., + [456, 346]]), array([[439, 445], + ..., + [434, 444]]), array([[158, 468], + ..., + [154, 466]])], 'rec_boxes': array([], dtype=float64)}]}} ``` 可视化结果保存在`save_path`下,其中印章OCR的可视化结果如下: - + ### 2.2 Python脚本方式集成 @@ -998,85 +984,85 @@ for res in output: doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 
+文本图像矫正模型的目录路径。如果设置为None,将会下载官方模型。 str None layout_detection_model_name -版面检测模型的名称。如果设置为None, 将会使用产线默认模型。 +版面检测模型的名称。如果设置为None,将会使用产线默认模型。 str None layout_detection_model_dir -版面检测模型的目录路径。如果设置为None, 将会下载官方模型。 +版面检测模型的目录路径。如果设置为None,将会下载官方模型。 str None seal_text_detection_model_name -印章文本检测模型的名称。如果设置为None, 将会使用产线默认模型。 +印章文本检测模型的名称。如果设置为None,将会使用产线默认模型。 str None seal_text_detection_model_dir -印章文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +印章文本检测模型的目录路径。如果设置为None,将会下载官方模型。 str None text_recognition_model_name -文本识别模型的名称。如果设置为None, 将会使用产线默认模型。 +文本识别模型的名称。如果设置为None,将会使用产线默认模型。 str None text_recognition_model_dir -文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +文本识别模型的目录路径。如果设置为None,将会下载官方模型。 str None text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_layout_detection -是否加载版面检测模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用版面检测模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None @@ -1084,42 +1070,42 @@ for res in output: layout_threshold 版面检测置信度阈值,得分大于该阈值才会被输出。
      -
    • float:大于 0 的任意浮点数 -
    • dict:key是int类别id, value是大于 0 的任意浮点数 -
    • None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5
    +
  • float:大于0的任意浮点数; +
  • dict:key是int类别id,value是大于0的任意浮点数; +
  • None:如果设置为None,将默认使用产线初始化的该参数值 0.5
  • float|dict None layout_nms -版面检测是否使用后处理NMS。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +版面检测是否使用后处理NMS。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None layout_unclip_ratio -检测框的边长缩放倍数。 +版面区域检测模型检测框的扩张系数。
      -
    • float, 大于0的浮点数,如 1.1 , 表示将模型输出的检测框中心不变,宽和高都扩张1.1倍
    • -
    • 列表, 如 [1.2, 1.5] , 表示将模型输出的检测框中心不变,宽度扩张1.2倍,高度扩张1.5倍
    • -
    • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为1.0
    • +
    • float:任意大于 0 浮点数;
    • +
    • Tuple[float,float]:在横纵两个方向各自的扩张系数;
    • +
    • dict,dict的key为int类型,代表cls_id, value为tuple类型,如{0: (1.1, 2.0)},表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍;
    • +
    • None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 1.0
    -float|list +float|Tuple[float,float]|dict None layout_merge_bboxes_mode -版面检测中模型输出的检测框的合并处理模式。 +版面区域检测的重叠框过滤方式。
-• large, 设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框。
-• small, 设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框。
-• union, 不进行框的过滤处理,内外框都保留
-• None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为large
+• str:large、small、union,分别表示重叠框过滤时选择保留大框、小框还是同时保留;
+• dict:dict的key为int类型,代表cls_id,value为str类型,如{0: "large", 2: "small"},表示对第0类别检测框使用large模式,对第2类别检测框使用small模式;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 large 。
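下面给出一个示意性的组合写法(取值仅为演示,并非推荐配置),展示如何同时设置检测框扩张系数与重叠框过滤方式:

```python
from paddleocr import SealRecognition

# 示意:宽度方向扩张 1.2 倍、高度方向扩张 1.5 倍,
# 并按类别分别指定重叠框过滤模式
pipeline = SealRecognition(
    layout_unclip_ratio=(1.2, 1.5),  # 也可传 dict,按 cls_id 分别设置,如 {0: (1.1, 2.0)}
    layout_merge_bboxes_mode={0: "large", 2: "small"},
)
```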
    -str +str|dict None @@ -1127,7 +1113,7 @@ for res in output: 印章文本检测的图像边长限制。
 • int:大于 0 的任意整数;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 736 。
    int @@ -1137,8 +1123,8 @@ for res in output: seal_det_limit_type 印章文本检测的图像边长限制类型。
-• str:支持 min 和 max,min 表示保证图像最短边不小于 det_limit_side_len,max 表示保证图像最长边不大于 limit_side_len;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min;
+• str:支持 min 和 max,min 表示保证图像最短边不小于 det_limit_side_len,max 表示保证图像最长边不大于 limit_side_len;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 min 。
    str @@ -1148,8 +1134,8 @@ for res in output: seal_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.2 。
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.2 。
  • float None @@ -1158,8 +1144,8 @@ for res in output: seal_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6 。
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.6 。
  • float None @@ -1168,8 +1154,8 @@ for res in output: seal_det_unclip_ratio 印章文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.5 。
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.5 。
  • float None @@ -1178,15 +1164,15 @@ for res in output: seal_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。
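上述 seal_det_* 与 seal_rec_score_thresh 等阈值参数均可在实例化产线时显式传入。下面是一个示意性写法(取值即为前文所述默认值,仅演示传参方式):

```python
from paddleocr import SealRecognition

# 示意:显式传入印章文本检测/识别相关阈值(取值为默认值)
pipeline = SealRecognition(
    seal_det_limit_side_len=736,   # 图像边长限制
    seal_det_limit_type="min",     # 保证最短边不小于 seal_det_limit_side_len
    seal_det_thresh=0.2,           # 检测像素阈值
    seal_det_box_thresh=0.6,       # 检测框阈值
    seal_det_unclip_ratio=0.5,     # 文字区域扩张系数
    seal_rec_score_thresh=0.0,     # 识别结果过滤阈值,0.0 即不过滤
)
```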
  • float None device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
 • CPU:如 cpu 表示使用 CPU 进行推理;
 • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -1194,7 +1180,7 @@ for res in output:
 • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
 • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
 • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
+• None:如果设置为None,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
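一个简单的设备指定示意(gpu:0 需要本机存在可用 GPU;无 GPU 环境可改为 cpu):

```python
from paddleocr import SealRecognition

# 示意:显式指定推理设备;不传该参数时会优先尝试本地 GPU 0 号设备
pipeline = SealRecognition(device="gpu:0")
# pipeline = SealRecognition(device="cpu")
```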
    str @@ -1222,14 +1208,14 @@ for res in output: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1265,21 +1251,15 @@ for res in output: input 待预测数据,支持多种输入类型,必填。
-• Python Var:如 numpy.ndarray 表示的图像数据
-• str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
-• List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray],["/root/data/img1.jpg", "/root/data/img2.jpg"],["/root/data1", "/root/data2"]
+• Python Var:如 numpy.ndarray 表示的图像数据;
+• str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径);
+• List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]、["/root/data/img1.jpg", "/root/data/img2.jpg"]、["/root/data1", "/root/data2"]。
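以下是覆盖几类 input 取值的示意性写法(空白数组仅作占位,演示类型用):

```python
import numpy as np
from paddleocr import SealRecognition

pipeline = SealRecognition()

# 示意:numpy.ndarray 作为输入(此处为占位的空白图像)
img = np.zeros((640, 640, 3), dtype=np.uint8)
output = pipeline.predict(input=img)

# 示意:本地路径组成的列表,一次调用处理多张图像
output = pipeline.predict(input=["./seal_text_det.png", "./seal_text_det.png"])
for res in output:
    res.print()
```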
    Python Var|str|list -device -与实例化时的参数相同。 -str -None - - use_doc_orientation_classify 是否在推理时使用文档方向分类模块。 bool @@ -1312,13 +1292,13 @@ for res in output: layout_unclip_ratio 与实例化时的参数相同。 -float +float|Tuple[float,float]|dict None layout_merge_bboxes_mode 与实例化时的参数相同。 -string +str|dict None @@ -1377,19 +1357,19 @@ for res in output: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1397,19 +1377,19 @@ for res in output: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1417,7 +1397,7 @@ for res in output: 将结果保存为图像格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -1438,7 +1418,7 @@ for res in output: - `boxes`: `(List[Dict])` 版面印章区域的检测框列表,每个列表中的元素,包含以下字段 - `cls_id`: `(int)` 检测框的印章类别id - `score`: `(float)` 检测框的置信度 - - `coordinate`: `(List[float])` 检测框的四个顶点坐标,顺序为x1,y1,x2,y2表示左上角的x坐标,左上角的y坐标,右下角x坐标,右下角的y坐标 + - `coordinate`: `(List[float])` 检测框的四个顶点坐标,顺序为x1,y1,x2,y2表示左上角的x坐标,左上角的y坐标,右下角x坐标,右下角的y坐标 - `seal_res_list`: `List[Dict]` 印章文本识别的结果列表,每个元素包含以下字段 - `input_path`: `(Union[str, None])` 印章文本识别产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None` - `page_index`: `(Union[int, None])` 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 `None` @@ -1498,7 +1478,7 @@ for res in output: - `json` 属性获取的预测结果为dict类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 -- `img` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `layout_det_res` 、 `seal_res_region1`和 `preprocessed_img`,对应的值是三个 `Image.Image` 对象:一个用于显示版面检测可视化,一个用于显示印章文本识别结果的可视化图像,另一个用于展示图像预处理的可视化图像。如果没有使用图像预处理子模块,则字典中不包含preprocessed_img,如果没有使用版面区域检测模块,则字典中不包含layout_det_res。 +- `img` 属性返回的预测结果是一个dict类型的数据。其中,键分别为 `layout_det_res` 、 `seal_res_region1`和 `preprocessed_img`,对应的值是三个 `Image.Image` 对象:一个用于显示版面检测可视化,一个用于显示印章文本识别结果的可视化图像,另一个用于展示图像预处理的可视化图像。如果没有使用图像预处理子模块,则dict中不包含preprocessed_img,如果没有使用版面区域检测模块,则dict中不包含layout_det_res。 ## 3. 开发集成/部署 @@ -1782,6 +1762,7 @@ for i, res in enumerate(result["sealRecResults"]): ## 4. 二次开发 如果印章文本识别产线提供的默认模型权重在您的场景中,精度或速度不满意,您可以尝试利用您自己拥有的特定领域或应用场景的数据对现有模型进行进一步的微调,以提升印章文本识别产线的在您的场景中的识别效果。 +### 4.1 模型微调 由于印章文本识别产线包含若干模块,模型产线的效果如果不及预期,可能来自于其中任何一个模块。您可以对识别效果差的图片进行分析,进而确定是哪个模块存在问题,并参考以下表格中对应的微调教程链接进行模型微调。 @@ -1820,3 +1801,93 @@ for i, res in enumerate(result["sealRecResults"]):
    + +### 4.2 模型应用 + +当您使用私有数据集完成微调训练后,可获得本地模型权重文件,然后可以通过参数指定本地模型保存路径的方式,或者通过自定义产线配置文件的方式,使用微调后的模型权重。 + +#### 4.2.1 通过参数指定本地模型路径 + +在初始化产线对象时,通过参数指定本地模型路径。以文本检测模型微调后的权重的使用方法为例,示例如下: + +命令行方式: + +```bash +# 通过 --doc_orientation_classify_model_dir 指定本地模型路径 +paddleocr seal_recognition -i ./seal_text_det.png --doc_orientation_classify_model_dir your_orientation_classify_model_path + +# 默认使用 PP-LCNet_x1_0_doc_ori 模型作为默认文本检测模型,如果微调的不是该模型,通过 --text_detection_model_name 修改模型名称 +paddleocr seal_recognition -i ./seal_text_det.png --doc_orientation_classify_model_name PP-LCNet_x1_0_doc_ori --doc_orientation_classify_model_dir your_orientation_classify_model_path +``` + +脚本方式: + +```python + +from paddleocr import SealRecognition + +# 通过 doc_orientation_classify_model_dir 指定本地模型路径 +pipeline = SealRecognition(doc_orientation_classify_model_dir ="./your_orientation_classify_model_path") + +# 默认使用 PP-LCNet_x1_0_doc_ori 模型作为默认文本检测模型,如果微调的不是该模型,通过 doc_orientation_classify_model_name 修改模型名称 +# pipeline = SealRecognition(doc_orientation_classify_model_name="PP-LCNet_x1_0_doc_ori", doc_orientation_classify_model_dir="./your_orientation_classify_model_path") + +``` + + +#### 4.2.2 通过配置文件指定本地模型路径 + + +1. 获取产线配置文件 + +可调用 PaddleOCR 中 通用OCR 产线对象的 `export_paddlex_config_to_yaml` 方法,将当前产线配置导出为 YAML 文件: + +```Python +from paddleocr import SealRecognition + +pipeline = SealRecognition() +pipeline.export_paddlex_config_to_yaml("SealRecognition.yaml") +``` + +2. 修改配置文件 + +在得到默认的产线配置文件后,将微调后模型权重的本地路径替换至产线配置文件中的对应位置即可。例如 + +```yaml +...... +SubPipelines: + DocPreprocessor: + SubModules: + DocOrientationClassify: + model_dir: null # 替换为微调后的文档方向分类模型权重路径 + model_name: PP-LCNet_x1_0_doc_ori # 如果微调的模型名称与默认模型名称不同,请一并修改此处 + module_name: doc_text_orientation + DocUnwarping: + model_dir: null # 替换为微调后的文档矫正模型权重路径 + model_name: UVDoc # 如果微调的模型名称与默认模型名称不同,请一并修改此处 + module_name: image_unwarping + pipeline_name: doc_preprocessor + use_doc_orientation_classify: true + use_doc_unwarping: true +...... +``` + +在产线配置文件中,不仅包含 SealRecognition CLI 和 Python API 支持的参数,还可进行更多高级配置,具体信息可在 [PaddleX模型产线使用概览](https://paddlepaddle.github.io/PaddleX/3.0/pipeline_usage/pipeline_develop_guide.html) 中找到对应的产线使用教程,参考其中的详细说明,根据需求调整各项配置。 + +3. 在 CLI 中加载产线配置文件 + +在修改完成配置文件后,通过命令行的 --paddlex_config 参数指定修改后的产线配置文件的路径,PaddleOCR 会读取其中的内容作为产线配置。示例如下: + +```bash +paddleocr seal_recognition --paddlex_config SealRecognition.yaml ... +``` + +4. 在 Python API 中加载产线配置文件 + +初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置dict,PaddleOCR 会读取其中的内容作为产线配置。示例如下: + +```python +from paddleocr import SealRecognition + +pipeline = SealRecognition(paddlex_config="SealRecognition.yaml") +``` diff --git a/docs/version3.x/pipeline_usage/table_recognition_v2.en.md b/docs/version3.x/pipeline_usage/table_recognition_v2.en.md index 04bedd1bb147fc012f77fa4e153b58f90a140177..ef52f6dace3bff0c19afebae319657663fb71bcd 100644 --- a/docs/version3.x/pipeline_usage/table_recognition_v2.en.md +++ b/docs/version3.x/pipeline_usage/table_recognition_v2.en.md @@ -2,19 +2,19 @@ comments: true --- -# General Table Recognition v2 Production Line User Guide +# General Table Recognition V2 Pipeline Usage Tutorial -## 1. Introduction to General Table Recognition v2 Production Line +## 1. Introduction to General Table Recognition v2 pipeline Table recognition is a technology that automatically identifies and extracts table content and its structure from documents or images. It is widely used in fields such as data entry, information retrieval, and document analysis. 
By using computer vision and machine learning algorithms, table recognition can convert complex table information into an editable format, making it easier for users to further process and analyze data. -The General Table Recognition v2 Production Line (PP-TableMagic) is designed to tackle table recognition tasks, identifying tables in images and outputting them in HTML format. Unlike the original General Table Recognition Production Line, this version introduces two new modules: table classification and table cell detection. By adopting a multi-model pipeline combining "table classification + table structure recognition + cell detection", it achieves better end-to-end table recognition performance compared to the previous version. Based on this, the General Table Recognition v2 Production Line natively supports targeted model fine-tuning, allowing developers to customize it to varying degrees for satisfactory performance in different application scenarios. Furthermore, the General Table Recognition v2 Production Line also supports end-to-end table structure recognition models (e.g., SLANet, SLANet_plus, etc.) and allows independent configuration for wired and wireless table recognition methods, enabling developers to freely select and combine the best table recognition solutions. +The General Table Recognition v2 pipeline (PP-TableMagic) is designed to tackle table recognition tasks, identifying tables in images and outputting them in HTML format. Unlike the original General Table Recognition pipeline, this version introduces two new modules: table classification and table cell detection. By adopting a multi-model pipeline combining "table classification + table structure recognition + cell detection", it achieves better end-to-end table recognition performance compared to the previous version. Based on this, the General Table Recognition v2 pipeline natively supports targeted model fine-tuning, allowing developers to customize it to varying degrees for satisfactory performance in different application scenarios. Furthermore, the General Table Recognition v2 pipeline also supports end-to-end table structure recognition models (e.g., SLANet, SLANet_plus, etc.) and allows independent configuration for wired and wireless table recognition methods, enabling developers to freely select and combine the best table recognition solutions. -This production line is applicable in a variety of fields, including general, manufacturing, finance, and transportation. It also provides flexible service deployment options, supporting multiple programming languages on various hardware. Additionally, it offers capabilities for secondary development, allowing you to train and fine-tune your own datasets based on this production line, with the trained models seamlessly integrated. +This pipeline is applicable in a variety of fields, including general, manufacturing, finance, and transportation. It also provides flexible service deployment options, supporting multiple programming languages on various hardware. Additionally, it offers capabilities for secondary development, allowing you to train and fine-tune your own datasets based on this pipeline, with the trained models seamlessly integrated. -The General Table Recognition Production Line v2 includes the following 8 modules. Each module can be trained and inferred independently and contains multiple models. For detailed information, please click on the corresponding module to view the documentation. 
+The General Table Recognition Pipeline v2 includes the following 8 modules. Each module can be trained and inferred independently and contains multiple models. For detailed information, please click on the corresponding module to view the documentation. - [Table Structure Recognition Module](../module_usage/table_structure_recognition.md) - [Table Classification Module](../module_usage/table_classification.md) @@ -25,7 +25,7 @@ This production line is applicable in a variety of fields, including general, ma - [Document Image Orientation Classification Module](../module_usage/doc_img_orientation_classification.md) (optional) - [Text Image Unwarping Module](../module_usage/text_image_unwarping.md) (optional) -In this production line, you can choose the models to use based on the benchmark data below. +In this pipeline, you can choose the models to use based on the benchmark data below.
    Table Structure Recognition Module Models: @@ -799,13 +799,13 @@ A single command allows you to quickly experience the effects of the table_recog paddleocr table_recognition_v2 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition_v2.jpg # Specify whether to use the document orientation classification model with --use_doc_orientation_classify -paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --use_doc_orientation_classify True +paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --use_doc_orientation_classify True # Specify whether to use the text image unwarping module with --use_doc_unwarping -paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --use_doc_unwarping True +paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --use_doc_unwarping True # Specify the device to use GPU for model inference with --device -paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device gpu +paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --device gpu ```
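The CLI calls above can also be expressed in Python. A minimal sketch mirroring the flags shown above (assuming the example image has been downloaded locally):

```python
from paddleocr import TableRecognitionPipelineV2

# Mirror of the CLI examples above: enable the optional preprocessing
# modules and run on GPU (switch to "cpu" on machines without a GPU)
pipeline = TableRecognitionPipelineV2(
    use_doc_orientation_classify=True,
    use_doc_unwarping=True,
    device="gpu",
)
output = pipeline.predict("./table_recognition_v2.jpg")
for res in output:
    res.print()
```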
    More command line parameters are supported. Click to expand for detailed descriptions of the command line parameters @@ -821,241 +821,219 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device input -Data to be predicted, supports multiple input types, required. -
-• Python Var: For example, image data represented as numpy.ndarray.
-• str: Local path to image files or PDF files: /root/data/img.jpg; as URL links, such as network URLs for image files or PDF files: example; as local directories, the directory must contain images to be predicted, such as local path: /root/data/ (currently, predictions do not support directories that contain PDF files; the PDF file must be specified to the specific file path).
-• List: The elements of the list must be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"].
    +Data to be predicted, required. +Local path to image files or PDF files: /root/data/img.jpg; as URL links, such as network URLs for image files or PDF files: example; as local directories, the directory must contain images to be predicted, such as local path: /root/data/ (currently, predictions do not support directories that contain PDF files; the PDF file must be specified to the specific file path). -Python Var|str|list +str save_path -Specify the path to save the inference result file. If set to None, the inference result will not be saved locally. +Specify the path to save the inference result file. If not set, the inference result will not be saved locally. str -None + layout_detection_model_name -Name of the layout detection model. If set to None, the default model of the pipeline will be used. +Name of the layout detection model. If not set, the default model of the pipeline will be used. str -None + layout_detection_model_dir -Directory path of the layout detection model. If set to None, the official model will be downloaded. +Directory path of the layout detection model. If not set, the official model will be downloaded. str -None + table_classification_model_name -Name of the table classification model. If set to None, the default model of the pipeline will be used. +Name of the table classification model. If not set, the default model of the pipeline will be used. str -None + table_classification_model_dir -Directory path of the table classification model. If set to None, the official model will be downloaded. +Directory path of the table classification model. If not set, the official model will be downloaded. str -None + wired_table_structure_recognition_model_name -Name of the wired table structure recognition model. If set to None, the default model of the pipeline will be used. +Name of the wired table structure recognition model. If not set, the default model of the pipeline will be used. str -None + wired_table_structure_recognition_model_dir -Directory path of the wired table structure recognition model. If set to None, the official model will be downloaded. +Directory path of the wired table structure recognition model. If not set, the official model will be downloaded. str -None + wireless_table_structure_recognition_model_name -Name of the wireless table structure recognition model. If set to None, the default model of the pipeline will be used. +Name of the wireless table structure recognition model. If not set, the default model of the pipeline will be used. str -None + wireless_table_structure_recognition_model_dir -Directory path of the wireless table structure recognition model. If set to None, the official model will be downloaded. +Directory path of the wireless table structure recognition model. If not set, the official model will be downloaded. str -None + wired_table_cells_detection_model_name -Name of the wired table cell detection model. If set to None, the default model of the pipeline will be used. +Name of the wired table cell detection model. If not set, the default model of the pipeline will be used. str -None + wired_table_cells_detection_model_dir -Directory path of the wired table cell detection model. If set to None, the official model will be downloaded. +Directory path of the wired table cell detection model. If not set, the official model will be downloaded. str -None + wireless_table_cells_detection_model_name -Name of the wireless table cell detection model. If set to None, the default model of the pipeline will be used. 
+Name of the wireless table cell detection model. If not set, the default model of the pipeline will be used. str -None + wireless_table_cells_detection_model_dir -Directory path of the wireless table cell detection model. If set to None, the official model will be downloaded. +Directory path of the wireless table cell detection model. If not set, the official model will be downloaded. str -None + doc_orientation_classify_model_name -Name of the document orientation classification model. If set to None, the default model of the pipeline will be used. +Name of the document orientation classification model. If not set, the default model of the pipeline will be used. str -None + doc_orientation_classify_model_dir -Directory path of the document orientation classification model. If set to None, the official model will be downloaded. +Directory path of the document orientation classification model. If not set, the official model will be downloaded. str -None + doc_unwarping_model_name -Name of the text image unwarping model. If set to None, the default model of the pipeline will be used. +Name of the text image unwarping model. If not set, the default model of the pipeline will be used. str -None + doc_unwarping_model_dir -Directory path of the text image unwarping model. If set to None, the official model will be downloaded. +Directory path of the text image unwarping model. If not set, the official model will be downloaded. str -None + text_detection_model_name -Name of the text detection model. If set to None, the default model of the pipeline will be used. +Name of the text detection model. If not set, the default model of the pipeline will be used. str -None + text_detection_model_dir -Directory path of the text detection model. If set to None, the official model will be downloaded. +Directory path of the text detection model. If not set, the official model will be downloaded. str -None + text_det_limit_side_len Image side length limit for text detection. -
-• int: Any integer greater than 0;
-• None: If set to None, the default value initialized by the pipeline will be used, initialized to 960;
    +Any integer greater than 0. If not set, the default value initialized by the pipeline will be used, initialized to 960. int -None + text_det_limit_type Type of the image side length limit for text detection. -
-• str: Supports min and max. min ensures that the shortest side of the image is not less than det_limit_side_len, while max ensures that the longest side of the image is not greater than limit_side_len.
-• None: If set to None, the default value initialized by the pipeline will be used, initialized to max;
    +Supports min and max. min ensures that the shortest side of the image is not less than det_limit_side_len, while max ensures that the longest side of the image is not greater than limit_side_len. If not set, the default value initialized by the pipeline will be used, initialized to max. str -None + text_det_thresh Detection pixel threshold. In the output probability map, only pixels with a score greater than this threshold will be considered text pixels. -
-• float: Any floating-point number greater than 0.
-• None: If set to None, the default value initialized by the pipeline will be used, which is 0.3.
    +Any floating-point number greater than 0. If not set, the default value initialized by the pipeline will be used, which is 0.3. float -None + text_det_box_thresh Detection box threshold. When the average score of all pixels within the detection result box is greater than this threshold, the result is considered a text area. -
-• float: Any floating-point number greater than 0.
-• None: If set to None, the default value initialized by the pipeline will be used, which is 0.6.
    +Any floating-point number greater than 0. If not set, the default value initialized by the pipeline will be used, which is 0.6. float -None + text_det_unclip_ratio Text detection expansion coefficient. This method expands the text area; the larger this value, the larger the expanded area. -
-• float: Any floating-point number greater than 0.
-• None: If set to None, the default value initialized by the pipeline will be used, which is 2.0.
    +Any floating-point number greater than 0. If not set, the default value initialized by the pipeline will be used, which is 2.0. float -None + text_recognition_model_name -Name of the text recognition model. If set to None, the default model of the pipeline will be used. +Name of the text recognition model. If not set, the default model of the pipeline will be used. str -None + text_recognition_model_dir -Directory path of the text recognition model. If set to None, the official model will be downloaded. +Directory path of the text recognition model. If not set, the official model will be downloaded. str -None + text_recognition_batch_size -Batch size for the text recognition model. If set to None, the default batch size will be set to 1. +Batch size for the text recognition model. If not set, the default batch size will be set to 1. int -None + text_rec_score_thresh Text recognition threshold. Text results with a score greater than this threshold will be retained. -
-• float: Any floating-point number greater than 0.
-• None: If set to None, the default value initialized by the pipeline will be used, which is 0.0. That is, no threshold is set.
    +Any floating-point number greater than 0. If not set, the default value initialized by the pipeline will be used, which is 0.0. That is, no threshold is set. float -None + use_doc_orientation_classify -Whether to load the document orientation classification module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the document orientation classification module. If not set, the default value initialized by the pipeline will be used, initialized to True. bool -None + use_doc_unwarping -Whether to load the text image unwarping module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the text image unwarping module. If not set, the default value initialized by the pipeline will be used, initialized to True. bool -None + use_layout_detection -Whether to load the layout detection module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the layout detection module. If not set, the default value initialized by the pipeline will be used, initialized to True. bool -None + use_ocr_model -Whether to load the OCR module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the OCR module. If not set, the default value initialized by the pipeline will be used, initialized to True. bool -None + device -The device used for inference. Supports specifying a specific card number. +The device used for inference. Supports specifying a specific card number:
 • CPU: For example, cpu indicates using CPU for inference;
 • GPU: For example, gpu:0 indicates using the first GPU for inference;
@@ -1063,11 +1041,10 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device
 • XPU: For example, xpu:0 indicates using the first XPU for inference;
 • MLU: For example, mlu:0 indicates using the first MLU for inference;
 • DCU: For example, dcu:0 indicates using the first DCU for inference;
-• None: If set to None, the default value initialized by the pipeline will be used, which prioritizes using the local GPU device 0; if not available, it will use the CPU device.
    +If not set, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used. str -None + enable_hpi @@ -1095,9 +1072,9 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -1109,21 +1086,51 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device paddlex_config Path to PaddleX pipeline configuration file. str -None +

    +To run inference on the [example image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition_v2.jpg), you can use the following command: + +```bash +paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --use_doc_orientation_classify False --use_doc_unwarping False +``` + The running results will be printed to the terminal. The default configuration of the table_recognition_v2 pipeline's running results is as follows: ``` -{'res': {'input_path': '/root/.paddlex/predict_input/table_recognition_v2.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_layout_detection': True, 'use_ocr_model': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 180}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 18, 'label': 'chart', 'score': 0.6778535842895508, 'coordinate': [0, 0, 1281.0206, 585.5999]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 4, 301], +{'res': {'input_path': 'table_recognition_v2.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_layout_detection': True, 'use_ocr_model': True}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 8, 'label': 'table', 'score': 0.86655592918396, 'coordinate': [0.0125130415, 0.41920784, 1281.3737, 585.3884]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 9, 21], + ..., + [ 9, 59]], + + ..., + + [[1046, 536], ..., - [ 4, 334]]], dtype=int16), 'text_det_params': {'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'box_thresh': 0.4, 'unclip_ratio': 2.0}, 'text_type': 'general', 'textline_orientation_angles': array([-1]), 'text_rec_score_thresh': 0, 'rec_texts': ['其'], 'rec_scores': array([0.97335929]), 'rec_polys': array([[[ 4, 301], + [1046, 573]]], dtype=int16), 'text_det_params': {'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'box_thresh': 0.6, 'unclip_ratio': 2.0}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0, 'rec_texts': ['部门', '报销人', '报销事由', '批准人:', '单据', '张', '合计金额', '元', '车费票', '其', '火车费票', '飞机票', '中', '旅住宿费', '其他', '补贴'], 'rec_scores': array([0.99958128, ..., 0.99317062]), 'rec_polys': array([[[ 9, 21], ..., - [ 4, 334]]], dtype=int16), 'rec_boxes': array([[ 4, ..., 334]], dtype=int16)}, 'table_res_list': []}} + [ 9, 59]], + + ..., + + [[1046, 536], + ..., + [1046, 573]]], dtype=int16), 'rec_boxes': array([[ 9, ..., 59], + ..., + [1046, ..., 573]], dtype=int16)}, 'table_res_list': [{'cell_box_list': [array([ 0.13052222, ..., 73.08310249]), array([104.43082511, ..., 73.27777413]), array([319.39041221, ..., 73.30439308]), array([424.2436837 , ..., 73.44736794]), array([580.75836265, ..., 73.24003914]), array([723.04370201, ..., 73.22717598]), array([984.67315757, ..., 73.20420387]), array([1.25130415e-02, ..., 5.85419208e+02]), array([984.37072837, ..., 137.02281502]), array([984.26586998, ..., 201.22290352]), array([984.24017417, ..., 585.30775765]), array([1039.90606773, ..., 265.44664314]), array([1039.69549644, ..., 329.30540779]), array([1039.66546714, ..., 393.57319954]), array([1039.5122689 , ..., 457.74644783]), array([1039.55535972, ..., 
521.73030403]), array([1039.58612144, ..., 585.09468392])], 'pred_html': '
    部门报销人报销事由批准人:
    单据 张
    合计金额 元
    其 中车费票
    火车费票
    飞机票
    旅住宿费
    其他
    补贴
    ', 'table_ocr_pred': {'rec_polys': array([[[ 9, 21], + ..., + [ 9, 59]], + + ..., + + [[1046, 536], + ..., + [1046, 573]]], dtype=int16), 'rec_texts': ['部门', '报销人', '报销事由', '批准人:', '单据', '张', '合计金额', '元', '车费票', '其', '火车费票', '飞机票', '中', '旅住宿费', '其他', '补贴'], 'rec_scores': array([0.99958128, ..., 0.99317062]), 'rec_boxes': array([[ 9, ..., 59], + ..., + [1046, ..., 573]], dtype=int16)}}]}} ``` The visualization results are saved under `save_path`, and the visualization results are as follows: @@ -1141,7 +1148,7 @@ pipeline = TableRecognitionPipelineV2() # ocr = TableRecognitionPipelineV2(use_doc_orientation_classify=True) # Specify whether to use the document orientation classification model with use_doc_orientation_classify # ocr = TableRecognitionPipelineV2(use_doc_unwarping=True) # Specify whether to use the text image unwarping module with use_doc_unwarping # ocr = TableRecognitionPipelineV2(device="gpu") # Specify the device to use GPU for model inference -output = pipeline.predict("./general_formula_recognition_001.png") +output = pipeline.predict("./table_recognition_v2.jpg") for res in output: res.print() ## Print the predicted structured output res.save_to_img("./output/") @@ -1277,7 +1284,7 @@ In the above Python script, the following steps are performed: Image side length limit for text detection.
 • int: Any integer greater than 0;
-• None: If set to None, the default value initialized by the pipeline will be used, initialized to 960;
+• None: If set to None, the default value initialized by the pipeline will be used, initialized to 960.
    int @@ -1287,8 +1294,8 @@ In the above Python script, the following steps are performed: text_det_limit_type Type of the image side length limit for text detection.
-• str: Supports min and max. min ensures that the shortest side of the image is not less than det_limit_side_len, while max ensures that the longest side of the image is not greater than limit_side_len.
-• None: If set to None, the default value initialized by the pipeline will be used, initialized to max;
+• str: Supports min and max. min ensures that the shortest side of the image is not less than det_limit_side_len, while max ensures that the longest side of the image is not greater than limit_side_len;
+• None: If set to None, the default value initialized by the pipeline will be used, initialized to max.
    str @@ -1298,7 +1305,7 @@ In the above Python script, the following steps are performed: text_det_thresh Detection pixel threshold. In the output probability map, only pixels with a score greater than this threshold will be considered text pixels.
-• float: Any floating-point number greater than 0.
+• float: Any floating-point number greater than 0;
 • None: If set to None, the default value initialized by the pipeline will be used, which is 0.3.
    @@ -1309,7 +1316,7 @@ In the above Python script, the following steps are performed: text_det_box_thresh Detection box threshold. When the average score of all pixels within the detection result box is greater than this threshold, the result is considered a text area.
-• float: Any floating-point number greater than 0.
+• float: Any floating-point number greater than 0;
 • None: If set to None, the default value initialized by the pipeline will be used, which is 0.6.
    @@ -1320,7 +1327,7 @@ In the above Python script, the following steps are performed: text_det_unclip_ratio Text detection expansion coefficient. This method expands the text area; the larger this value, the larger the expanded area.
-• float: Any floating-point number greater than 0.
+• float: Any floating-point number greater than 0;
 • None: If set to None, the default value initialized by the pipeline will be used, which is 2.0.
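As a sketch of how these text-detection knobs are passed together (the values shown are the documented defaults, used only to illustrate the call):

```python
from paddleocr import TableRecognitionPipelineV2

# Sketch: explicitly passing the text-detection parameters described above
pipeline = TableRecognitionPipelineV2(
    text_det_limit_side_len=960,  # side-length limit
    text_det_limit_type="max",    # longest side no greater than the limit
    text_det_thresh=0.3,          # pixel threshold
    text_det_box_thresh=0.6,      # box threshold
    text_det_unclip_ratio=2.0,    # expansion coefficient
)
```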
    @@ -1349,40 +1356,40 @@ In the above Python script, the following steps are performed: text_rec_score_thresh Text recognition threshold. Text results with a score greater than this threshold will be retained.
-• float: Any floating-point number greater than 0.
-• None: If set to None, the default value initialized by the pipeline will be used, which is 0.0. That is, no threshold is set.
+• float: Any floating-point number greater than 0;
  • None: If set to None, the default value initialized by the pipeline will be used, which is 0.0. That is, no threshold is set. + float None use_doc_orientation_classify -Whether to load the document orientation classification module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the document orientation classification module. If set to None, the default value initialized by the pipeline will be used, initialized to True. bool None use_doc_unwarping -Whether to load the text image unwarping module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the text image unwarping module. If set to None, the default value initialized by the pipeline will be used, initialized to True. bool None use_layout_detection -Whether to load the layout detection module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the layout detection module. If set to None, the default value initialized by the pipeline will be used, initialized to True. bool None use_ocr_model -Whether to load the OCR module. If set to None, the default value initialized by the pipeline will be used, initialized to True. +Whether to load and use the OCR module. If set to None, the default value initialized by the pipeline will be used, initialized to True. bool None device -The device used for inference. Supports specifying a specific card number. +The device used for inference. Supports specifying a specific card number:
 • CPU: For example, cpu indicates using CPU for inference;
 • GPU: For example, gpu:0 indicates using the first GPU for inference;
@@ -1390,7 +1397,7 @@ In the above Python script, the following steps are performed:
 • XPU: For example, xpu:0 indicates using the first XPU for inference;
 • MLU: For example, mlu:0 indicates using the first MLU for inference;
 • DCU: For example, dcu:0 indicates using the first DCU for inference;
-• None: If set to None, the default value initialized by the pipeline will be used, which prioritizes using the local GPU device 0; if not available, it will use the CPU device.
+• None: If set to None, the pipeline initialized value for this parameter will be used. During initialization, the local GPU device 0 will be preferred; if unavailable, the CPU device will be used.
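A minimal sketch of device selection (the gpu:0 form assumes the machine has at least one GPU):

```python
from paddleocr import TableRecognitionPipelineV2

# Sketch: pin inference to the first GPU; omit the parameter to let the
# pipeline prefer GPU 0 and fall back to CPU automatically
pipeline = TableRecognitionPipelineV2(device="gpu:0")
# pipeline = TableRecognitionPipelineV2(device="cpu")
```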
    str @@ -1418,13 +1425,13 @@ In the above Python script, the following steps are performed: precision Computation precision, such as fp32, fp16. str -fp32 +"fp32" enable_mkldnn -Whether to enable the MKL-DNN acceleration library. If set to None, it will be enabled by default. +Whether to enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. bool -None +True cpu_threads @@ -1461,8 +1468,8 @@ The parameters and descriptions of the `predict()` method are as follows: input Data to be predicted, supports multiple input types, required.
-• Python Var: For example, image data represented as numpy.ndarray.
-• str: Local path to image files or PDF files: /root/data/img.jpg; as URL links, such as network URLs for image files or PDF files: example; as local directories, the directory must contain images to be predicted, such as local path: /root/data/ (currently, predictions do not support directories that contain PDF files; the PDF file must be specified to the specific file path).
+• Python Var: For example, image data represented as numpy.ndarray;
+• str: Local path to image files or PDF files: /root/data/img.jpg; as URL links, such as network URLs for image files or PDF files: example; as local directories, the directory must contain images to be predicted, such as local path: /root/data/ (currently, predictions do not support directories that contain PDF files; the PDF file must be specified to the specific file path);
 • List: The elements of the list must be of the above types, such as [numpy.ndarray, numpy.ndarray], ["/root/data/img1.jpg", "/root/data/img2.jpg"], ["/root/data1", "/root/data2"].
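For example, the different input forms can be exercised as follows (a sketch; the zero-filled array is only a placeholder image):

```python
import numpy as np
from paddleocr import TableRecognitionPipelineV2

pipeline = TableRecognitionPipelineV2()

# Sketch: an ndarray input (placeholder image data)
img = np.zeros((480, 640, 3), dtype=np.uint8)
output = pipeline.predict(img)

# Sketch: a list of file paths processed in one call
output = pipeline.predict(["./table_recognition_v2.jpg", "./table_recognition_v2.jpg"])
for res in output:
    res.print()
```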
    @@ -1470,12 +1477,6 @@ The parameters and descriptions of the `predict()` method are as follows: -device -Same as the parameters during instantiation. -str -None - - use_doc_orientation_classify Whether to use the document orientation classification module during inference. bool @@ -1593,19 +1594,19 @@ The parameters and descriptions of the `predict()` method are as follows: Print results to the terminal format_json bool -Whether to format the output content using JSON indentation +Whether to format the output content using JSON indentation. True indent int -Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True +Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True. 4 ensure_ascii bool -Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False keeps the original characters. Effective only when format_json is True +Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False keeps the original characters. Effective only when format_json is True. False @@ -1619,13 +1620,13 @@ The parameters and descriptions of the `predict()` method are as follows: indent int -Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True +Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True. 4 ensure_ascii bool -Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False keeps the original characters. Effective only when format_json is True +Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False keeps the original characters. Effective only when format_json is True. False @@ -1633,7 +1634,7 @@ The parameters and descriptions of the `predict()` method are as follows: Save results as an image format file save_path str -The path to save the file, supporting directory or file path +The path to save the file, supporting directory or file path. None @@ -1641,7 +1642,7 @@ The parameters and descriptions of the `predict()` method are as follows: Save results as an xlsx format file save_path str -The path to save the file, supporting directory or file path +The path to save the file, supporting directory or file path. None @@ -1649,7 +1650,7 @@ The parameters and descriptions of the `predict()` method are as follows: Save results as an html format file save_path str -The path to save the file, supporting directory or file path +The path to save the file, supporting directory or file path. None @@ -1871,30 +1872,6 @@ Below is the API reference for basic service-oriented deployment and examples of No -layoutThreshold -number | null -Please refer to the layout_threshold parameter description in the predict method of the model object. -No - - -layoutNms -boolean | null -Please refer to the layout_nms parameter description in the predict method of the model object. -No - - -layoutUnclipRatio -number | array | null -Please refer to the layout_unclip_ratio parameter description in the predict method of the model object. 
-No - - -layoutMergeBboxesMode -string | null -Please refer to the layout_merge_bboxes_mode parameter description in the predict method of the model object. -No - - textDetLimitSideLen integer | null Please refer to the text_det_limit_side_len parameter description in the predict method of the model object. @@ -2050,7 +2027,7 @@ Since the General Table Recognition v2 model consists of several modules, if the Table classification error Table Classification Module -Link +Link Table cell location error @@ -2089,3 +2066,147 @@ Since the General Table Recognition v2 model consists of several modules, if the + + +### 4.2 Model Application + +When you complete fine-tuning with your private dataset, you’ll obtain local model weight files. You can then use these fine-tuned model weights either by specifying the local model save path through parameters or by customizing the pipeline configuration file. + +#### 4.2.1 Specifying Local Model Path Through Parameters + +When initializing the pipeline object, you can specify the local model path via parameters. Taking the use of fine-tuned weights for the SLANeXt_wired table structure recognition model as an example: + +Command line method: + +```bash +# Specify local model path via --wired_table_structure_recognition_model_dir +paddleocr table_recognition_v2_pipeline -i ./table_recognition_v2.jpg --wired_table_structure_recognition_model_dir your_model_path +``` + +```bash +# If using SLANeXt_wired model as the default wired table structure recognition model, and if you fine-tuned a different model, modify the model name via --wired_table_structure_recognition_model_name +paddleocr table_recognition_v2_pipeline -i ./table_recognition_v2.jpg --wired_table_structure_recognition_model_name SLANeXt_wired --wired_table_structure_recognition_model_dir your_model_path +``` + +Python script method: + +```python +from paddleocr import TableRecognitionPipelineV2 + +# Specify local model path via wired_table_structure_recognition_model_dir +pipeline = TableRecognitionPipelineV2(wired_table_structure_recognition_model_dir="./your_model_path") + +# By default, SLANeXt_wired is used as the default table recognition model. If you fine-tuned a different model, modify the model name via wired_table_structure_recognition_model_name +# pipeline = PaddleOCR(wired_table_structure_recognition_model_name="SLANeXt_wired", wired_table_structure_recognition_model_dir="./your_model_path") +``` + +#### 4.2.2 Specifying Local Model Path Through Configuration File + +1.Obtain the pipeline configuration file +You can call the `export_paddlex_config_to_yaml method` of the TableRecognitionPipelineV2 object in PaddleOCR to export the current pipeline configuration to a YAML file: + +```Python +from paddleocr import TableRecognitionPipelineV2 + +pipeline = TableRecognitionPipelineV2() +pipeline.export_paddlex_config_to_yaml("TableRecognitionPipelineV2.yaml") +``` + +2.Modify the configuration file + +After obtaining the default pipeline configuration file, simply replace the corresponding paths in the pipeline configuration file with the local paths of your fine-tuned model weights. For example: + +```yaml +...... 
+SubModules: + LayoutDetection: + module_name: layout_detection + model_name: PicoDet_layout_1x_table + model_dir: null # Replace with the path to fine-tuned layout detection model weights + + TableClassification: + module_name: table_classification + model_name: PP-LCNet_x1_0_table_cls + model_dir: null # Replace with the path to fine-tuned table classification model weights + + WiredTableStructureRecognition: + module_name: table_structure_recognition + model_name: SLANeXt_wired + model_dir: null # Replace with the path to fine-tuned wired table structure recognition model weights + + WirelessTableStructureRecognition: + module_name: table_structure_recognition + model_name: SLANeXt_wireless + model_dir: null # Replace with the path to fine-tuned wireless table structure recognition model weights + + WiredTableCellsDetection: + module_name: table_cells_detection + model_name: RT-DETR-L_wired_table_cell_det + model_dir: null # Replace with the path to fine-tuned wired table cell detection model weights + + WirelessTableCellsDetection: + module_name: table_cells_detection + model_name: RT-DETR-L_wireless_table_cell_det + model_dir: null # Replace with the path to fine-tuned wireless table cell detection model weights + +SubPipelines: + DocPreprocessor: + pipeline_name: doc_preprocessor + use_doc_orientation_classify: True + use_doc_unwarping: True + SubModules: + DocOrientationClassify: + module_name: doc_text_orientation + model_name: PP-LCNet_x1_0_doc_ori + model_dir: null # Replace with the path to fine-tuned document orientation classification model weights + + DocUnwarping: + module_name: image_unwarping + model_name: UVDoc + model_dir: null + + GeneralOCR: + pipeline_name: OCR + text_type: general + use_doc_preprocessor: False + use_textline_orientation: False + SubModules: + TextDetection: + module_name: text_detection + model_name: PP-OCRv5_server_det + model_dir: null # Replace with the path to fine-tuned text detection model weights + limit_side_len: 960 + limit_type: max + max_side_limit: 4000 + thresh: 0.3 + box_thresh: 0.4 + unclip_ratio: 1.5 + + TextRecognition: + module_name: text_recognition + model_name: PP-OCRv5_server_rec + model_dir: null # Replace with the path to fine-tuned text recognition model weights + batch_size: 1 + score_thresh: 0 +...... +``` + +The pipeline configuration file includes not only the parameters supported by PaddleOCR CLI and Python API but also allows for more advanced configurations. For detailed information, you can find the corresponding pipeline usage tutorial in [Overview of PaddleX Model Pipeline Usage](https://paddlepaddle.github.io/PaddleX/3.0/en/pipeline_usage/pipeline_develop_guide.html), and refer to the detailed instructions to adjust the configurations according to your needs. + +3. Loading the pipeline configuration file in CLI + +After completing the configuration file modifications, specify the path to the modified pipeline configuration file using the --paddlex_config parameter in the command line. PaddleOCR will read its contents as the pipeline configuration. For example: + +```bash +paddleocr table_recognition_v2_pipeline --paddlex_config PaddleOCR.yaml ... +``` + +4. Loading the pipeline configuration file in Python API + +When initializing the pipeline object, you can pass the PaddleX pipeline configuration file path or configuration dictionary through the paddlex_config parameter. PaddleOCR will read its contents as the pipeline configuration. 
For example: + +```python +from paddleocr import TableRecognitionPipelineV2 + +pipeline = TableRecognitionPipelineV2(paddlex_config="TableRecognitionPipelineV2.yaml") +``` diff --git a/docs/version3.x/pipeline_usage/table_recognition_v2.md b/docs/version3.x/pipeline_usage/table_recognition_v2.md index 5bd83b7a8a2e8dd12fc25fb1c31b88e58d60d025..8a4b0fa3c65c3f98cff08ab6138baeff3bc7cb22 100644 --- a/docs/version3.x/pipeline_usage/table_recognition_v2.md +++ b/docs/version3.x/pipeline_usage/table_recognition_v2.md @@ -18,7 +18,7 @@ comments: true - [表格结构识别模块](../module_usage/table_structure_recognition.md) - [表格分类模块](../module_usage/table_classification.md) -- [表格单元格定位模块](../module_usage/table_cells_detection.md) +- [表格单元格检测模块](../module_usage/table_cells_detection.md) - [文本检测模块](../module_usage/text_detection.md) - [文本识别模块](../module_usage/text_recognition.md) - [版面区域检测模块](../module_usage/layout_detection.md)(可选) @@ -803,13 +803,13 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
-• List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray]、["/root/data/img1.jpg", "/root/data/img2.jpg"]、["/root/data1", "/root/data2"]
  • - +待预测数据,必填。 +如图像文件或者PDF文件的本地路径:/root/data/img.jpg如URL链接,如图像文件或PDF文件的网络URL:示例如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)。 -Python Var|str|list +str save_path -指定推理结果文件保存的路径。如果设置为None, 推理结果将不会保存到本地。 +指定推理结果文件保存的路径。如果不设置,推理结果将不会保存到本地。 str -None + layout_detection_model_name -版面检测模型的名称。如果设置为None, 将会使用产线默认模型。 +版面检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + layout_detection_model_dir -版面检测模型的目录路径。如果设置为None, 将会下载官方模型。 +版面检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + table_classification_model_name -表格分类模型的名称。如果设置为None, 将会使用产线默认模型。 +表格分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + table_classification_model_dir -表格分类模型的目录路径。如果设置为None, 将会下载官方模型。 +表格分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + wired_table_structure_recognition_model_name -有线表格结构识别模型的名称。如果设置为None, 将会使用产线默认模型。 +有线表格结构识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + wired_table_structure_recognition_model_dir -有线表格结构识别模型的目录路径。如果设置为None, 将会下载官方模型。 +有线表格结构识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + wireless_table_structure_recognition_model_name -无线表格结构识别模型的名称。如果设置为None, 将会使用产线默认模型。 +无线表格结构识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + wireless_table_structure_recognition_model_dir -无线表格结构识别模型的目录路径。如果设置为None, 将会下载官方模型。 +无线表格结构识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + wired_table_cells_detection_model_name -有线表格单元检测模型的名称。如果设置为None, 将会使用产线默认模型。 +有线表格单元检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + wired_table_cells_detection_model_dir -有线表格单元检测模型的目录路径。如果设置为None, 将会下载官方模型。 +有线表格单元检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + wireless_table_cells_detection_model_name -无线表格单元检测模型的名称。如果设置为None, 将会使用产线默认模型。 +无线表格单元检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + wireless_table_cells_detection_model_dir -无线表格单元检测模型的目录路径。如果设置为None, 将会下载官方模型。 +无线表格单元检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果不设置,将会下载官方模型。 str -None + doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果不设置,将会使用产线默认模型。 str -None + doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_detection_model_name -文本检测模型的名称。如果设置为None, 将会使用产线默认模型。 +文本检测模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_detection_model_dir -文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +文本检测模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_det_limit_side_len 文本检测的图像边长限制。 -
-• int:大于 0 的任意整数;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960;
    +大于 0 的任意整数。如果不设置,将默认使用产线初始化的该参数值,初始化为 960int -None + text_det_limit_type 文本检测的图像边长限制类型。 -
-• str:支持 min 和 max,min 表示保证图像最短边不小于 det_limit_side_len,max 表示保证图像最长边不大于 limit_side_len;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max;
    +支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len。如果不设置,将默认使用产线初始化的该参数值,初始化为 maxstr -None + text_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。 -
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3 。
    +大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.3float -None + text_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。 -
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6 。
    +大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.6float -None + text_det_unclip_ratio 文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 -
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0 。
    +大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 2.0float -None + text_recognition_model_name -文本识别模型的名称。如果设置为None, 将会使用产线默认模型。 +文本识别模型的名称。如果不设置,将会使用产线默认模型。 str -None + text_recognition_model_dir -文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +文本识别模型的目录路径。如果不设置,将会下载官方模型。 str -None + text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果不设置,将默认设置批处理大小为1int -None + text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。 -
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
    +大于0的任意浮点数 +。如果不设置,将默认使用产线初始化的该参数值 0.0。即不设阈值。 float -None + use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_layout_detection -是否加载版面检测模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用版面检测模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + use_ocr_model -是否加载OCR模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用OCR模块。如果不设置,将默认使用产线初始化的该参数值,初始化为Truebool -None + device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
 • CPU:如 cpu 表示使用 CPU 进行推理;
 • GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -1063,11 +1049,10 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device
 • XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
 • MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
 • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
    +如果不设置,将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str -None + enable_hpi @@ -1095,10 +1080,10 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1110,21 +1095,50 @@ paddleocr table_recognition_v2 -i ./general_formula_recognition_001.png --device paddlex_config PaddleX产线配置文件路径。 str -None +
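+在组合使用上表中的参数时,命令行写法大致如下(仅为示意,其中设备号与保存路径均为假设值):
+```bash
+# 示意:通过 --device 指定推理设备,--save_path 指定结果保存目录(各参数含义见上表)
+paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --device gpu:0 --save_path ./output
+```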

    +接下来,通过下列一行命令来对[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition_v2.jpg)进行推理: +```bash +paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --use_doc_orientation_classify False --use_doc_unwarping False +``` + 运行结果会被打印到终端上,默认配置的 table_recognition_v2 产线的运行结果如下: ``` -{'res': {'input_path': '/root/.paddlex/predict_input/table_recognition_v2.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_layout_detection': True, 'use_ocr_model': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 180}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 18, 'label': 'chart', 'score': 0.6778535842895508, 'coordinate': [0, 0, 1281.0206, 585.5999]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 4, 301], +{'res': {'input_path': 'table_recognition_v2.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_layout_detection': True, 'use_ocr_model': True}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 8, 'label': 'table', 'score': 0.86655592918396, 'coordinate': [0.0125130415, 0.41920784, 1281.3737, 585.3884]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 9, 21], ..., - [ 4, 334]]], dtype=int16), 'text_det_params': {'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'box_thresh': 0.4, 'unclip_ratio': 2.0}, 'text_type': 'general', 'textline_orientation_angles': array([-1]), 'text_rec_score_thresh': 0, 'rec_texts': ['其'], 'rec_scores': array([0.97335929]), 'rec_polys': array([[[ 4, 301], + [ 9, 59]], + + ..., + + [[1046, 536], + ..., + [1046, 573]]], dtype=int16), 'text_det_params': {'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'box_thresh': 0.6, 'unclip_ratio': 2.0}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0, 'rec_texts': ['部门', '报销人', '报销事由', '批准人:', '单据', '张', '合计金额', '元', '车费票', '其', '火车费票', '飞机票', '中', '旅住宿费', '其他', '补贴'], 'rec_scores': array([0.99958128, ..., 0.99317062]), 'rec_polys': array([[[ 9, 21], + ..., + [ 9, 59]], + + ..., + + [[1046, 536], ..., - [ 4, 334]]], dtype=int16), 'rec_boxes': array([[ 4, ..., 334]], dtype=int16)}, 'table_res_list': []}} + [1046, 573]]], dtype=int16), 'rec_boxes': array([[ 9, ..., 59], + ..., + [1046, ..., 573]], dtype=int16)}, 'table_res_list': [{'cell_box_list': [array([ 0.13052222, ..., 73.08310249]), array([104.43082511, ..., 73.27777413]), array([319.39041221, ..., 73.30439308]), array([424.2436837 , ..., 73.44736794]), array([580.75836265, ..., 73.24003914]), array([723.04370201, ..., 73.22717598]), array([984.67315757, ..., 73.20420387]), array([1.25130415e-02, ..., 5.85419208e+02]), array([984.37072837, ..., 137.02281502]), array([984.26586998, ..., 201.22290352]), array([984.24017417, ..., 585.30775765]), array([1039.90606773, ..., 265.44664314]), array([1039.69549644, ..., 329.30540779]), array([1039.66546714, ..., 393.57319954]), array([1039.5122689 , ..., 457.74644783]), array([1039.55535972, ..., 521.73030403]), array([1039.58612144, ..., 585.09468392])], 'pred_html': '
    部门报销人报销事由批准人:
    单据 张
    合计金额 元
    其 中车费票
    火车费票
    飞机票
    旅住宿费
    其他
    补贴
    ', 'table_ocr_pred': {'rec_polys': array([[[ 9, 21], + ..., + [ 9, 59]], + + ..., + + [[1046, 536], + ..., + [1046, 573]]], dtype=int16), 'rec_texts': ['部门', '报销人', '报销事由', '批准人:', '单据', '张', '合计金额', '元', '车费票', '其', '火车费票', '飞机票', '中', '旅住宿费', '其他', '补贴'], 'rec_scores': array([0.99958128, ..., 0.99317062]), 'rec_boxes': array([[ 9, ..., 59], + ..., + [1046, ..., 573]], dtype=int16)}}]}} ``` 可视化结果保存在`save_path`下,可视化结果如下: @@ -1142,7 +1156,7 @@ pipeline = TableRecognitionPipelineV2() # ocr = TableRecognitionPipelineV2(use_doc_orientation_classify=True) # 通过 use_doc_orientation_classify 指定是否使用文档方向分类模型 # ocr = TableRecognitionPipelineV2(use_doc_unwarping=True) # 通过 use_doc_unwarping 指定是否使用文本图像矫正模块 # ocr = TableRecognitionPipelineV2(device="gpu") # 通过 device 指定模型推理时使用 GPU -output = pipeline.predict("./general_formula_recognition_001.png") +output = pipeline.predict("./table_recognition_v2.jpg") for res in output: res.print() ## 打印预测的结构化输出 res.save_to_img("./output/") @@ -1167,109 +1181,109 @@ for res in output: layout_detection_model_name -版面检测模型的名称。如果设置为None, 将会使用产线默认模型。 +版面检测模型的名称。如果设置为None,将会使用产线默认模型。 str None layout_detection_model_dir -版面检测模型的目录路径。如果设置为None, 将会下载官方模型。 +版面检测模型的目录路径。如果设置为None,将会下载官方模型。 str None table_classification_model_name -表格分类模型的名称。如果设置为None, 将会使用产线默认模型。 +表格分类模型的名称。如果设置为None,将会使用产线默认模型。 str None table_classification_model_dir -表格分类模型的目录路径。如果设置为None, 将会下载官方模型。 +表格分类模型的目录路径。如果设置为None,将会下载官方模型。 str None wired_table_structure_recognition_model_name -有线表格结构识别模型的名称。如果设置为None, 将会使用产线默认模型。 +有线表格结构识别模型的名称。如果设置为None,将会使用产线默认模型。 str None wired_table_structure_recognition_model_dir -有线表格结构识别模型的目录路径。如果设置为None, 将会下载官方模型。 +有线表格结构识别模型的目录路径。如果设置为None,将会下载官方模型。 str None wireless_table_structure_recognition_model_name -无线表格结构识别模型的名称。如果设置为None, 将会使用产线默认模型。 +无线表格结构识别模型的名称。如果设置为None,将会使用产线默认模型。 str None wireless_table_structure_recognition_model_dir -无线表格结构识别模型的目录路径。如果设置为None, 将会下载官方模型。 +无线表格结构识别模型的目录路径。如果设置为None,将会下载官方模型。 str None wired_table_cells_detection_model_name -有线表格单元检测模型的名称。如果设置为None, 将会使用产线默认模型。 +有线表格单元检测模型的名称。如果设置为None,将会使用产线默认模型。 str None wired_table_cells_detection_model_dir -有线表格单元检测模型的目录路径。如果设置为None, 将会下载官方模型。 +有线表格单元检测模型的目录路径。如果设置为None,将会下载官方模型。 str None wireless_table_cells_detection_model_name -无线表格单元检测模型的名称。如果设置为None, 将会使用产线默认模型。 +无线表格单元检测模型的名称。如果设置为None,将会使用产线默认模型。 str None wireless_table_cells_detection_model_dir -无线表格单元检测模型的目录路径。如果设置为None, 将会下载官方模型。 +无线表格单元检测模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_orientation_classify_model_name -文档方向分类模型的名称。如果设置为None, 将会使用产线默认模型。 +文档方向分类模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_orientation_classify_model_dir -文档方向分类模型的目录路径。如果设置为None, 将会下载官方模型。 +文档方向分类模型的目录路径。如果设置为None,将会下载官方模型。 str None doc_unwarping_model_name -文本图像矫正模型的名称。如果设置为None, 将会使用产线默认模型。 +文本图像矫正模型的名称。如果设置为None,将会使用产线默认模型。 str None doc_unwarping_model_dir -文本图像矫正模型的目录路径。如果设置为None, 将会下载官方模型。 +文本图像矫正模型的目录路径。如果设置为None,将会下载官方模型。 str None text_detection_model_name -文本检测模型的名称。如果设置为None, 将会使用产线默认模型。 +文本检测模型的名称。如果设置为None,将会使用产线默认模型。 str None text_detection_model_dir -文本检测模型的目录路径。如果设置为None, 将会下载官方模型。 +文本检测模型的目录路径。如果设置为None,将会下载官方模型。 str None @@ -1278,7 +1292,7 @@ for res in output: 文本检测的图像边长限制。
• int:大于 0 的任意整数;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 960;
    int @@ -1288,8 +1302,8 @@ for res in output: text_det_limit_type 文本检测的图像边长限制类型。
-• str:支持 min 和 max,min 表示保证图像最短边不小于 det_limit_side_len,max 表示保证图像最长边不大于 limit_side_len;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max;
+• str:支持 min 和 max,min 表示保证图像最短边不小于 det_limit_side_len,max 表示保证图像最长边不大于 limit_side_len;
+• None:如果设置为None,将默认使用产线初始化的该参数值,初始化为 max;
    str @@ -1299,8 +1313,8 @@ for res in output: text_det_thresh 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.3
+• float:大于 0 的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.3。
  • float None @@ -1309,8 +1323,8 @@ for res in output: text_det_box_thresh 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.6
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.6。
  • float None @@ -1319,27 +1333,27 @@ for res in output: text_det_unclip_ratio 文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 2.0
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 2.0。
  • float None text_recognition_model_name -文本识别模型的名称。如果设置为None, 将会使用产线默认模型。 +文本识别模型的名称。如果设置为None,将会使用产线默认模型。 str None text_recognition_model_dir -文本识别模型的目录路径。如果设置为None, 将会下载官方模型。 +文本识别模型的目录路径。如果设置为None,将会下载官方模型。 str None text_recognition_batch_size -文本识别模型的批处理大小。如果设置为None, 将默认设置批处理大小为1。 +文本识别模型的批处理大小。如果设置为None,将默认设置批处理大小为1int None @@ -1347,39 +1361,39 @@ for res in output: text_rec_score_thresh 文本识别阈值,得分大于该阈值的文本结果会被保留。
-• float:大于 0 的任意浮点数
-• None:如果设置为 None, 将默认使用产线初始化的该参数值 0.0。即不设阈值
+• float:大于0的任意浮点数;
+• None:如果设置为None,将默认使用产线初始化的该参数值 0.0,即不设阈值。
  • float None use_doc_orientation_classify -是否加载文档方向分类模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文档方向分类模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_doc_unwarping -是否加载文本图像矫正模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用文本图像矫正模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_layout_detection -是否加载版面检测模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用版面检测模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None use_ocr_model -是否加载OCR模块。如果设置为None, 将默认使用产线初始化的该参数值,初始化为True。 +是否加载并使用OCR模块。如果设置为None,将默认使用产线初始化的该参数值,初始化为Truebool None device -用于推理的设备。支持指定具体卡号。 +用于推理的设备。支持指定具体卡号:
• CPU:如 cpu 表示使用 CPU 进行推理;
• GPU:如 gpu:0 表示使用第 1 块 GPU 进行推理;
@@ -1387,7 +1401,7 @@ for res in output:
• XPU:如 xpu:0 表示使用第 1 块 XPU 进行推理;
• MLU:如 mlu:0 表示使用第 1 块 MLU 进行推理;
• DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
-• None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备;
+• None:如果设置为None,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。
    str @@ -1415,14 +1429,14 @@ for res in output: precision 计算精度,如 fp32、fp16。 str -fp32 +"fp32" enable_mkldnn -是否启用 MKL-DNN 加速库。如果设置为None, 将默认启用。 +是否启用 MKL-DNN 加速推理。如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool -None +True cpu_threads @@ -1458,21 +1472,15 @@ for res in output: input 待预测数据,支持多种输入类型,必填
-• Python Var:如 numpy.ndarray 表示的图像数据
-• str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)
-• List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray],["/root/data/img1.jpg", "/root/data/img2.jpg"],["/root/data1", "/root/data2"]
+• Python Var:如 numpy.ndarray 表示的图像数据;
+• str:如图像文件或者PDF文件的本地路径:/root/data/img.jpg;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:/root/data/(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径);
+• List:列表元素需为上述类型数据,如[numpy.ndarray, numpy.ndarray],["/root/data/img1.jpg", "/root/data/img2.jpg"],["/root/data1", "/root/data2"]。
    Python Var|str|list -device -与实例化时的参数相同。 -str -None - - use_doc_orientation_classify 是否在推理时使用文档方向分类模块。 bool @@ -1546,7 +1554,7 @@ for res in output: use_wired_table_cells_trans_to_html -是否在推理时使用有线表单元格检测结果直转HTML模式,启用则直接基于有线表单元格检测结果的几何关系构建HTML。 +是否在推理时使用有线表单元格检测结果直转HTML模式,启用则直接基于有线表单元格检测结果的几何关系构建HTML。 bool False @@ -1588,19 +1596,19 @@ for res in output: 打印结果到终端 format_json bool -是否对输出内容进行使用 JSON 缩进格式化 +是否对输出内容进行使用 JSON 缩进格式化。 True indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1608,19 +1616,19 @@ for res in output: 将结果保存为json格式的文件 save_path str -保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 +保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致。 无 indent int -指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效 +指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效。 4 ensure_ascii bool -控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效 +控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效。 False @@ -1628,7 +1636,7 @@ for res in output: 将结果保存为图像格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -1636,7 +1644,7 @@ for res in output: 将结果保存为xlsx格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -1644,7 +1652,7 @@ for res in output: 将结果保存为html格式的文件 save_path str -保存的文件路径,支持目录或文件路径 +保存的文件路径,支持目录或文件路径。 无 @@ -1666,7 +1674,7 @@ for res in output: - `boxes`: `(List[Dict])` 版面印章区域的检测框列表,每个列表中的元素,包含以下字段 - `cls_id`: `(int)` 检测框的印章类别id - `score`: `(float)` 检测框的置信度 - - `coordinate`: `(List[float])` 检测框的四个顶点坐标,顺序为x1,y1,x2,y2表示左上角的x坐标,左上角的y坐标,右下角x坐标,右下角的y坐标 + - `coordinate`: `(List[float])` 检测框的四个顶点坐标,顺序为x1,y1,x2,y2表示左上角的x坐标,左上角的y坐标,右下角x坐标,右下角的y坐标 - `doc_preprocessor_res`: `(Dict[str, Union[str, Dict[str, bool], int]])` 文档预处理子产线的输出结果。仅当`use_doc_preprocessor=True`时存在 - `input_path`: `(Union[str, None])` 图像预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None` - `model_settings`: `(Dict)` 预处理子产线的模型配置参数 @@ -1722,13 +1730,13 @@ for res in output: - `json` 属性获取的预测结果为dict类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。 -- `img` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `table_res_img`、`ocr_res_img` 、`layout_res_img` 和 `preprocessed_img`,对应的值是四个 `Image.Image` 对象,按顺序分别为:表格识别结果的可视化图像、OCR 结果的可视化图像、版面区域检测结果的可视化图像、图像预处理的可视化图像。如果没有使用某个子模块,则字典中不包含对应的结果图像。 +- `img` 属性返回的预测结果是一个dict类型的数据。其中,键分别为 `table_res_img`、`ocr_res_img` 、`layout_res_img` 和 `preprocessed_img`,对应的值是四个 `Image.Image` 对象,按顺序分别为:表格识别结果的可视化图像、OCR 结果的可视化图像、版面区域检测结果的可视化图像、图像预处理的可视化图像。如果没有使用某个子模块,则dict中不包含对应的结果图像。 ## 3. 
开发集成/部署 如果产线可以达到您对产线推理速度和精度的要求,您可以直接进行开发集成/部署。 -若您需要将产线直接应用在您的Python项目中,可以参考 [2.2 Python脚本方式](#22-python脚本方式集成)中的示例代码。 +若您需要将产线直接应用在您的Python项目中,可以参考 [2.2 Python脚本方式集成](#22-python)中的示例代码。 此外,PaddleOCR 也提供了其他两种部署方式,详细说明如下: @@ -1867,30 +1875,6 @@ for res in output: 否 -layoutThreshold -number | null -请参阅产线对象中 predict 方法的 layout_threshold 参数相关说明。 -否 - - -layoutNms -boolean | null -请参阅产线对象中 predict 方法的 layout_nms 参数相关说明。 -否 - - -layoutUnclipRatio -number | array | null -请参阅产线对象中 predict 方法的 layout_unclip_ratio 参数相关说明。 -否 - - -layoutMergeBboxesMode -string | null -请参阅产线对象中 predict 方法的 layout_merge_bboxes_mode 参数相关说明。 -否 - - textDetLimitSideLen integer | null 请参阅产线对象中 predict 方法的 text_det_limit_side_len 参数相关说明。 @@ -2046,7 +2030,7 @@ for i, res in enumerate(result["tableRecResults"]): 表格分类错误 表格分类模块 -链接 +链接 表格单元格定位错误 @@ -2085,3 +2069,148 @@ for i, res in enumerate(result["tableRecResults"]): + + +### 4.2 模型应用 + +当您使用私有数据集完成微调训练后,可获得本地模型权重文件,然后可以通过参数指定本地模型保存路径的方式,或者通过自定义产线配置文件的方式,使用微调后的模型权重。 + +#### 4.2.1 通过参数指定本地模型路径 + +在初始化产线对象时,通过参数指定本地模型路径。以有线表结构识别模型 SLANeXt_wired 微调后的权重的使用方法为例,示例如下: + +命令行方式: + +```bash +# 通过 --wired_table_structure_recognition_model_dir 指定本地模型路径 +paddleocr table_recognition_v2_pipeline -i ./table_recognition_v2.jpg --wired_table_structure_recognition_model_dir your_model_path + +# 假设使用 SLANeXt_wired 模型作为默认有线表结构识别模型,如果微调的不是该模型,通过 --wired_table_structure_recognition_model_name 修改模型名称 +paddleocr table_recognition_v2_pipeline -i ./table_recognition_v2.jpg --wired_table_structure_recognition_model_name SLANeXt_wired --wired_table_structure_recognition_model_dir your_model_path +``` + +脚本方式: + +```python + +from paddleocr import TableRecognitionPipelineV2 + +# 通过 wired_table_structure_recognition_model_dir 指定本地模型路径 +pipeline = TableRecognitionPipelineV2(wired_table_structure_recognition_model_dir="./your_model_path") + +# 默认使用 SLANeXt_wired 模型作为默认表格识别模型,如果微调的不是该模型,通过 wired_table_structure_recognition_model_name 修改模型名称 +# pipeline = PaddleOCR(wired_table_structure_recognition_model_name="SLANeXt_wired", wired_table_structure_recognition_model_dir="./your_model_path") + +``` + +#### 4.2.2 通过配置文件指定本地模型路径 + +1.获取产线配置文件 + +可调用 PaddleOCR 中 通用表格识别v2产线对象的 `export_paddlex_config_to_yaml` 方法,将当前产线配置导出为 YAML 文件: + +```Python +from paddleocr import TableRecognitionPipelineV2 + +pipeline = TableRecognitionPipelineV2() +pipeline.export_paddlex_config_to_yaml("TableRecognitionPipelineV2.yaml") +``` + +2.修改配置文件 + +在得到默认的产线配置文件后,将微调后模型权重的本地路径替换至产线配置文件中的对应位置即可。例如 + +```yaml +...... 
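+# 说明(示意):以下为导出的默认产线配置节选;将需要替换的 model_dir 从 null 改为微调后模型权重的本地路径即可生效。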
+SubModules: + LayoutDetection: + module_name: layout_detection + model_name: PicoDet_layout_1x_table + model_dir: null # 替换为微调后的版面区域检测模型权重路径 + + TableClassification: + module_name: table_classification + model_name: PP-LCNet_x1_0_table_cls + model_dir: null # 替换为微调后的表格分类模型权重路径 + + WiredTableStructureRecognition: + module_name: table_structure_recognition + model_name: SLANeXt_wired + model_dir: null # 替换为微调后的有线表格结构识别模型权重路径 + + WirelessTableStructureRecognition: + module_name: table_structure_recognition + model_name: SLANeXt_wireless + model_dir: null # 替换为微调后的无线表格结构识别模型权重路径 + + WiredTableCellsDetection: + module_name: table_cells_detection + model_name: RT-DETR-L_wired_table_cell_det + model_dir: null # 替换为微调后的有线表格单元格检测模型权重路径 + + WirelessTableCellsDetection: + module_name: table_cells_detection + model_name: RT-DETR-L_wireless_table_cell_det + model_dir: null # 替换为微调后的无线表格单元格检测模型权重路径 + +SubPipelines: + DocPreprocessor: + pipeline_name: doc_preprocessor + use_doc_orientation_classify: True + use_doc_unwarping: True + SubModules: + DocOrientationClassify: + module_name: doc_text_orientation + model_name: PP-LCNet_x1_0_doc_ori + model_dir: null # 替换为微调后的文档图像方向分类模型权重路径 + + DocUnwarping: + module_name: image_unwarping + model_name: UVDoc + model_dir: null + + GeneralOCR: + pipeline_name: OCR + text_type: general + use_doc_preprocessor: False + use_textline_orientation: False + SubModules: + TextDetection: + module_name: text_detection + model_name: PP-OCRv5_server_det + model_dir: null # 替换为微调后的文本检测模型权重路径 + limit_side_len: 960 + limit_type: max + max_side_limit: 4000 + thresh: 0.3 + box_thresh: 0.4 + unclip_ratio: 1.5 + + TextRecognition: + module_name: text_recognition + model_name: PP-OCRv5_server_rec + model_dir: null # 替换为微调后文本识别的模型权重路径 + batch_size: 1 + score_thresh: 0 +...... +``` + +在产线配置文件中,不仅包含 PaddleOCR CLI 和 Python API 支持的参数,还可进行更多高级配置,具体信息可在 [PaddleX模型产线使用概览](https://paddlepaddle.github.io/PaddleX/3.0/pipeline_usage/pipeline_develop_guide.html) 中找到对应的产线使用教程,参考其中的详细说明,根据需求调整各项配置。 + +3.在 CLI 中加载产线配置文件 + +在修改完成配置文件后,通过命令行的 --paddlex_config 参数指定修改后的产线配置文件的路径,PaddleOCR 会读取其中的内容作为产线配置。示例如下: + +```bash +paddleocr table_recognition_v2_pipeline --paddlex_config PaddleOCR.yaml ... 
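+# 说明:"..." 代表其余命令行参数(保持原有用法不变);--paddlex_config 指向修改后的产线配置文件,例如前文导出的 TableRecognitionPipelineV2.yaml。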
+``` + +4.在 Python API 中加载产线配置文件 + +初始化产线对象时,可通过 paddlex_config 参数传入 PaddleX 产线配置文件路径或配置dict,PaddleOCR 会读取其中的内容作为产线配置。示例如下: + +```python +from paddleocr import TableRecognitionPipelineV2 + +pipeline = TableRecognitionPipelineV2(paddlex_config="TableRecognitionPipelineV2.yaml") +``` diff --git a/mkdocs.yml b/mkdocs.yml index e1e9c19e7937a4fd64bf2d93913a0429b54ebc56..d217ea9c1d6c5c2b1c8d13cea8382a613304cf5d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -167,6 +167,9 @@ plugins: 社区贡献: Community Contribution 附录: Appendix 配置 PaddleOCR 推理包日志系统: Configure the logging system for the PaddleOCR inference package + 模块概述: Module Overview + 产线概述: Pipeline Overview + 基于Python或C++预测引擎推理: Python and CPP Inference # - locale: ja # name: 日本語 # site_name: PaddleOCR ドキュメント @@ -274,7 +277,9 @@ nav: - 获取onnx模型: version3.x/deployment/obtaining_onnx_models.md - 端侧部署: version3.x/deployment/on_device_deployment.md - 服务化部署: version3.x/deployment/serving.md + - 基于Python或C++预测引擎推理: version3.x/deployment/python_and_cpp_infer.md - 模块列表: + - 模块概述: version3.x/module_usage/module_overview.md - 文档图像方向分类模块: version3.x/module_usage/doc_img_orientation_classification.md - 文档类视觉语言模型模块: version3.x/module_usage/doc_vlm.md - 公式识别模块: version3.x/module_usage/formula_recognition.md @@ -285,9 +290,10 @@ nav: - 表格结构识别模块: version3.x/module_usage/table_structure_recognition.md - 文本检测模块: version3.x/module_usage/text_detection.md - 文本图像矫正模块: version3.x/module_usage/text_image_unwarping.md - - 文本行方向分类模块: version3.x/module_usage/text_line_orientation_classification.md + - 文本行方向分类模块: version3.x/module_usage/textline_orientation_classification.md - 文本识别模块: version3.x/module_usage/text_recognition.md - 产线列表: + - 产线概述: version3.x/pipeline_usage/pipeline_overview.md - 公式识别产线: version3.x/pipeline_usage/formula_recognition.md - 文档图像预处理产线: version3.x/pipeline_usage/doc_preprocessor.md - 文档理解产线: version3.x/pipeline_usage/doc_understanding.md @@ -301,6 +307,8 @@ nav: - PaddleOCR 与 PaddleX: version3.x/paddleocr_and_paddlex.md - PaddleOCR 3.x 升级说明: update/upgrade_notes.md - 配置 PaddleOCR 推理包日志系统: version3.x/logging.md + - 产线并行推理: version3.x/pipeline_usage/instructions/parallel_inference.md + - 低代码全流程开发: - 概述: version3.x/paddlex/overview.md - 快速开始: version3.x/paddlex/quick_start.md diff --git a/paddleocr/_cli.py b/paddleocr/_cli.py index 5280a252b6c0a763f7120bf4a63ca05d39a04ba1..aedbc87aa053e783c5c3af0d20a90f8f0dcfbef9 100644 --- a/paddleocr/_cli.py +++ b/paddleocr/_cli.py @@ -97,14 +97,16 @@ def _register_install_hpi_deps_command(subparsers): subparser.set_defaults(executor=_install_hpi_deps) -def _parse_args(): +def _get_parser(): parser = argparse.ArgumentParser(prog="paddleocr") - parser.add_argument("--version", action="version", version=f"%(prog)s {version}") + parser.add_argument( + "-v", "--version", action="version", version=f"%(prog)s {version}" + ) subparsers = parser.add_subparsers(dest="subcommand") _register_pipelines(subparsers) _register_models(subparsers) _register_install_hpi_deps_command(subparsers) - return parser.parse_args() + return parser def _execute(args): @@ -114,5 +116,9 @@ def _execute(args): def main(): logger.setLevel(logging.INFO) warnings.filterwarnings("default", category=CLIDeprecationWarning) - args = _parse_args() + parser = _get_parser() + args = parser.parse_args() + if args.subcommand is None: + parser.print_usage(sys.stderr) + sys.exit(2) _execute(args) diff --git a/paddleocr/_common_args.py b/paddleocr/_common_args.py index 50988c91bce1e321c272c47c30c53962b643417b..22fe30588c1a14e8005db504877072c9d0267f15 100644 --- 
a/paddleocr/_common_args.py +++ b/paddleocr/_common_args.py @@ -77,15 +77,10 @@ def prepare_common_init_args(model_name, common_args): pp_option.run_mode = "trt_fp16" elif device_type == "cpu": enable_mkldnn = common_args["enable_mkldnn"] - if enable_mkldnn is None: - from paddle.inference import Config - - if hasattr(Config, "set_mkldnn_cache_capacity"): - enable_mkldnn = True - else: - enable_mkldnn = False if enable_mkldnn: pp_option.run_mode = "mkldnn" + else: + pp_option.run_mode = "paddle" pp_option.cpu_threads = common_args["cpu_threads"] init_kwargs["pp_option"] = pp_option @@ -132,7 +127,7 @@ def add_common_cli_opts(parser, *, default_enable_hpi, allow_multiple_devices): "--enable_mkldnn", type=str2bool, default=DEFAULT_ENABLE_MKLDNN, - help="Enable oneDNN (formerly MKL-DNN) acceleration for inference. By default, oneDNN will be used when available, except for models and pipelines that have known oneDNN issues.", + help="Enable MKL-DNN acceleration for inference. If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set.", ) parser.add_argument( "--cpu_threads", diff --git a/paddleocr/_constants.py b/paddleocr/_constants.py index 58aec8a2bd3f1416025c11c6cc2701fc8fcd5285..232f45d920380ece0368e931ddb4d0988d53f7b0 100644 --- a/paddleocr/_constants.py +++ b/paddleocr/_constants.py @@ -16,6 +16,6 @@ DEFAULT_DEVICE = None DEFAULT_USE_TENSORRT = False DEFAULT_MIN_SUBGRAPH_SIZE = 3 DEFAULT_PRECISION = "fp32" -DEFAULT_ENABLE_MKLDNN = None +DEFAULT_ENABLE_MKLDNN = True DEFAULT_CPU_THREADS = 10 SUPPORTED_PRECISION_LIST = ["fp32", "fp16"] diff --git a/paddleocr/_mkldnn_blocklists.py b/paddleocr/_mkldnn_blocklists.py deleted file mode 100644 index 4bd4cf7867b1e8747a5efe09ee3966cc1678a0ce..0000000000000000000000000000000000000000 --- a/paddleocr/_mkldnn_blocklists.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -PIPELINE_MKLDNN_BLOCKLIST = [ - "formula_recognition", - "table_recognition_v2", - "PP-StructureV3", -] - -MODEL_MKLDNN_BLOCKLIST = [ - "SLANeXt_wired", - "SLANeXt_wireless", - "LaTeX_OCR_rec", - "PP-FormulaNet-L", - "PP-FormulaNet-S", - "UniMERNet", - "PP-FormulaNet_plus-L", - "PP-FormulaNet_plus-M", - "PP-FormulaNet_plus-S", -] diff --git a/paddleocr/_models/__init__.py b/paddleocr/_models/__init__.py index 543114fcab038d9e2f8ad8da98643b2f5be3fb5c..bc94674f7a36285165d757e613bda2b279bc9af5 100644 --- a/paddleocr/_models/__init__.py +++ b/paddleocr/_models/__init__.py @@ -22,7 +22,7 @@ from .table_classification import TableClassification from .table_structure_recognition import TableStructureRecognition from .text_detection import TextDetection from .text_image_unwarping import TextImageUnwarping -from .text_line_orientation_classification import TextLineOrientationClassification +from .textline_orientation_classification import TextLineOrientationClassification from .text_recognition import TextRecognition __all__ = [ diff --git a/paddleocr/_models/base.py b/paddleocr/_models/base.py index 780eeb8757271773b50c585aaa5525e8ea96f261..d63fccda3e025c4fb470d6975b0c2acf3076ba56 100644 --- a/paddleocr/_models/base.py +++ b/paddleocr/_models/base.py @@ -22,8 +22,6 @@ from .._common_args import ( parse_common_args, prepare_common_init_args, ) -from .._mkldnn_blocklists import MODEL_MKLDNN_BLOCKLIST -from .._utils.logging import logger _DEFAULT_ENABLE_HPI = False @@ -41,14 +39,6 @@ class PaddleXPredictorWrapper(metaclass=abc.ABCMeta): model_name if model_name is not None else self.default_model_name ) self._model_dir = model_dir - if ( - common_args.get("enable_mkldnn", None) is None - and self._model_name in MODEL_MKLDNN_BLOCKLIST - ): - logger.warning( - f"oneDNN will be disabled for the {repr(self._model_name)} model." 
- ) - common_args["enable_mkldnn"] = False self._common_args = parse_common_args( common_args, default_enable_hpi=_DEFAULT_ENABLE_HPI ) diff --git a/paddleocr/_models/layout_detection.py b/paddleocr/_models/layout_detection.py index 63c072191ec8387753f63c96a725f6bba5961c1c..14427fba8bc77f085b5eb2b023d67d547fb47f3a 100644 --- a/paddleocr/_models/layout_detection.py +++ b/paddleocr/_models/layout_detection.py @@ -21,7 +21,7 @@ from ._object_detection import ( class LayoutDetection(ObjectDetection): @property def default_model_name(self): - return "PP-DocLayout-L" + return "PP-DocLayout_plus-L" @classmethod def get_cli_subcommand_executor(cls): diff --git a/paddleocr/_models/text_detection.py b/paddleocr/_models/text_detection.py index 084ca812738776c0d313e61e324f7545d5260ce3..809dbd232406315b46524e32d1587405cc928c9e 100644 --- a/paddleocr/_models/text_detection.py +++ b/paddleocr/_models/text_detection.py @@ -24,7 +24,7 @@ from ._text_detection import TextDetectionMixin, TextDetectionSubcommandExecutor class TextDetection(TextDetectionMixin, PaddleXPredictorWrapper): @property def default_model_name(self): - return "PP-OCRv4_mobile_det" + return "PP-OCRv5_server_det" @classmethod def get_cli_subcommand_executor(cls): diff --git a/paddleocr/_models/text_recognition.py b/paddleocr/_models/text_recognition.py index 1d359ecb987a1c1079d103c35c4e140c33394528..4f96f8e84de2febcccb69496886ca2bf6bc73fdc 100644 --- a/paddleocr/_models/text_recognition.py +++ b/paddleocr/_models/text_recognition.py @@ -34,7 +34,7 @@ class TextRecognition(PaddleXPredictorWrapper): @property def default_model_name(self): - return "PP-OCRv4_mobile_rec" + return "PP-OCRv5_server_rec" @classmethod def get_cli_subcommand_executor(cls): diff --git a/paddleocr/_models/text_line_orientation_classification.py b/paddleocr/_models/textline_orientation_classification.py similarity index 95% rename from paddleocr/_models/text_line_orientation_classification.py rename to paddleocr/_models/textline_orientation_classification.py index db347ca134ff23fce8f930ab37fe7033c7b61c8b..908c0ddf1658018a627bf468a86c1e5747f5887e 100644 --- a/paddleocr/_models/text_line_orientation_classification.py +++ b/paddleocr/_models/textline_orientation_classification.py @@ -33,7 +33,7 @@ class TextLineOrientationClassificationSubcommandExecutor( ): @property def subparser_name(self): - return "text_line_orientation_classification" + return "textline_orientation_classification" @property def wrapper_cls(self): diff --git a/paddleocr/_pipelines/base.py b/paddleocr/_pipelines/base.py index 3ac8e4c591cfe06111b6390a34d3c361394cbe92..5ce69cd1973cb97f218e48f54c226792cb02e87f 100644 --- a/paddleocr/_pipelines/base.py +++ b/paddleocr/_pipelines/base.py @@ -25,8 +25,6 @@ from .._common_args import ( parse_common_args, prepare_common_init_args, ) -from .._mkldnn_blocklists import PIPELINE_MKLDNN_BLOCKLIST -from .._utils.logging import logger _DEFAULT_ENABLE_HPI = None @@ -58,14 +56,6 @@ class PaddleXPipelineWrapper(metaclass=abc.ABCMeta): ): super().__init__() self._paddlex_config = paddlex_config - if ( - common_args.get("enable_mkldnn", None) is None - and self._paddlex_pipeline_name in PIPELINE_MKLDNN_BLOCKLIST - ): - logger.warning( - f"oneDNN will be disabled for the {repr(self._paddlex_pipeline_name)} pipeline." 
- ) - common_args["enable_mkldnn"] = False self._common_args = parse_common_args( common_args, default_enable_hpi=_DEFAULT_ENABLE_HPI ) diff --git a/paddleocr/_pipelines/doc_preprocessor.py b/paddleocr/_pipelines/doc_preprocessor.py index afb7dfea16b03c12ac0325424fc415d1ac0d5206..b8c34df37737aeee6ca4ede29404a6d8594ef0ee 100644 --- a/paddleocr/_pipelines/doc_preprocessor.py +++ b/paddleocr/_pipelines/doc_preprocessor.py @@ -133,12 +133,12 @@ class DocPreprocessorCLISubcommandExecutor(PipelineCLISubcommandExecutor): subparser.add_argument( "--use_doc_orientation_classify", type=str2bool, - help="Whether to use the document image orientation classification model.", + help="Whether to use document image orientation classification.", ) subparser.add_argument( "--use_doc_unwarping", type=str2bool, - help="Whether to use the text image unwarping model.", + help="Whether to use text image unwarping.", ) def execute_with_args(self, args): diff --git a/paddleocr/_pipelines/ocr.py b/paddleocr/_pipelines/ocr.py index b38dc5b5b8f9cc4c80907a09d79ee3376091aeb0..d53d0d20844ad1ca2383fc8f2580ba995234e795 100644 --- a/paddleocr/_pipelines/ocr.py +++ b/paddleocr/_pipelines/ocr.py @@ -17,6 +17,7 @@ # maintainability? import sys +import warnings from .._utils.cli import ( add_simple_inference_args, @@ -43,8 +44,8 @@ _DEPRECATED_PARAM_NAME_MAPPING = { "rec_model_dir": "text_recognition_model_dir", "rec_batch_num": "text_recognition_batch_size", "use_angle_cls": "use_textline_orientation", - "cls_model_dir": "text_line_orientation_model_dir", - "cls_batch_num": "text_line_orientation_batch_size", + "cls_model_dir": "textline_orientation_model_dir", + "cls_batch_num": "textline_orientation_batch_size", } _SUPPORTED_OCR_VERSIONS = ["PP-OCRv3", "PP-OCRv4", "PP-OCRv5"] @@ -60,9 +61,9 @@ class PaddleOCR(PaddleXPipelineWrapper): doc_unwarping_model_dir=None, text_detection_model_name=None, text_detection_model_dir=None, - text_line_orientation_model_name=None, - text_line_orientation_model_dir=None, - text_line_orientation_batch_size=None, + textline_orientation_model_name=None, + textline_orientation_model_dir=None, + textline_orientation_batch_size=None, text_recognition_model_name=None, text_recognition_model_dir=None, text_recognition_batch_size=None, @@ -86,7 +87,17 @@ class PaddleOCR(PaddleXPipelineWrapper): f"Invalid OCR version: {ocr_version}. Supported values are {_SUPPORTED_OCR_VERSIONS}." 
) - if text_detection_model_dir is None and text_recognition_model_dir is None: + if all( + map( + lambda p: p is None, + ( + text_detection_model_name, + text_detection_model_dir, + text_recognition_model_name, + text_recognition_model_dir, + ), + ) + ): if lang is not None or ocr_version is not None: det_model_name, rec_model_name = self._get_ocr_model_names( lang, ocr_version @@ -97,6 +108,12 @@ class PaddleOCR(PaddleXPipelineWrapper): ) text_detection_model_name = det_model_name text_recognition_model_name = rec_model_name + else: + if lang is not None or ocr_version is not None: + warnings.warn( + "`lang` and `ocr_version` will be ignored when model names or model directories are not `None`.", + stacklevel=2, + ) params = { "doc_orientation_classify_model_name": doc_orientation_classify_model_name, @@ -105,9 +122,9 @@ class PaddleOCR(PaddleXPipelineWrapper): "doc_unwarping_model_dir": doc_unwarping_model_dir, "text_detection_model_name": text_detection_model_name, "text_detection_model_dir": text_detection_model_dir, - "text_line_orientation_model_name": text_line_orientation_model_name, - "text_line_orientation_model_dir": text_line_orientation_model_dir, - "text_line_orientation_batch_size": text_line_orientation_batch_size, + "textline_orientation_model_name": textline_orientation_model_name, + "textline_orientation_model_dir": textline_orientation_model_dir, + "textline_orientation_batch_size": textline_orientation_batch_size, "text_recognition_model_name": text_recognition_model_name, "text_recognition_model_dir": text_recognition_model_dir, "text_recognition_batch_size": text_recognition_batch_size, @@ -232,13 +249,13 @@ class PaddleOCR(PaddleXPipelineWrapper): "text_detection_model_dir" ], "SubModules.TextLineOrientation.model_name": self._params[ - "text_line_orientation_model_name" + "textline_orientation_model_name" ], "SubModules.TextLineOrientation.model_dir": self._params[ - "text_line_orientation_model_dir" + "textline_orientation_model_dir" ], "SubModules.TextLineOrientation.batch_size": self._params[ - "text_line_orientation_batch_size" + "textline_orientation_batch_size" ], "SubModules.TextRecognition.model_name": self._params[ "text_recognition_model_name" @@ -278,13 +295,118 @@ class PaddleOCR(PaddleXPipelineWrapper): return create_config_from_structure(STRUCTURE) def _get_ocr_model_names(self, lang, ppocr_version): + LATIN_LANGS = [ + "af", + "az", + "bs", + "cs", + "cy", + "da", + "de", + "es", + "et", + "fr", + "ga", + "hr", + "hu", + "id", + "is", + "it", + "ku", + "la", + "lt", + "lv", + "mi", + "ms", + "mt", + "nl", + "no", + "oc", + "pi", + "pl", + "pt", + "ro", + "rs_latin", + "sk", + "sl", + "sq", + "sv", + "sw", + "tl", + "tr", + "uz", + "vi", + "french", + "german", + ] + ARABIC_LANGS = ["ar", "fa", "ug", "ur"] + CYRILLIC_LANGS = [ + "ru", + "rs_cyrillic", + "be", + "bg", + "uk", + "mn", + "abq", + "ady", + "kbd", + "ava", + "dar", + "inh", + "che", + "lbe", + "lez", + "tab", + ] + DEVANAGARI_LANGS = [ + "hi", + "mr", + "ne", + "bh", + "mai", + "ang", + "bho", + "mah", + "sck", + "new", + "gom", + "sa", + "bgc", + ] + SPECIFIC_LANGS = [ + "ch", + "en", + "korean", + "japan", + "chinese_cht", + "te", + "ka", + "ta", + ] + if lang is None: lang = "ch" + if ppocr_version is None: - ppocr_version = "PP-OCRv5" + if lang in ("ch", "chinese_cht", "en", "japan"): + ppocr_version = "PP-OCRv5" + elif lang in ( + LATIN_LANGS + + ARABIC_LANGS + + CYRILLIC_LANGS + + DEVANAGARI_LANGS + + SPECIFIC_LANGS + ): + ppocr_version = "PP-OCRv3" + else: + # Unknown language 
specified + return None, None if ppocr_version == "PP-OCRv5": - return "PP-OCRv5_mobile_det", "PP-OCRv5_mobile_rec" + if lang in ("ch", "chinese_cht", "en", "japan"): + return "PP-OCRv5_server_det", "PP-OCRv5_server_rec" + else: + return None, None elif ppocr_version == "PP-OCRv4": if lang == "ch": return "PP-OCRv4_mobile_det", "PP-OCRv4_mobile_rec" @@ -294,84 +416,6 @@ class PaddleOCR(PaddleXPipelineWrapper): return None, None else: # PP-OCRv3 - LATIN_LANGS = [ - "af", - "az", - "bs", - "cs", - "cy", - "da", - "de", - "es", - "et", - "fr", - "ga", - "hr", - "hu", - "id", - "is", - "it", - "ku", - "la", - "lt", - "lv", - "mi", - "ms", - "mt", - "nl", - "no", - "oc", - "pi", - "pl", - "pt", - "ro", - "rs_latin", - "sk", - "sl", - "sq", - "sv", - "sw", - "tl", - "tr", - "uz", - "vi", - "french", - "german", - ] - ARABIC_LANGS = ["ar", "fa", "ug", "ur"] - CYRILLIC_LANGS = [ - "ru", - "rs_cyrillic", - "be", - "bg", - "uk", - "mn", - "abq", - "ady", - "kbd", - "ava", - "dar", - "inh", - "che", - "lbe", - "lez", - "tab", - ] - DEVANAGARI_LANGS = [ - "hi", - "mr", - "ne", - "bh", - "mai", - "ang", - "bho", - "mah", - "sck", - "new", - "gom", - "sa", - "bgc", - ] rec_lang = None if lang in LATIN_LANGS: rec_lang = "latin" @@ -382,17 +426,9 @@ class PaddleOCR(PaddleXPipelineWrapper): elif lang in DEVANAGARI_LANGS: rec_lang = "devanagari" else: - if lang in [ - "ch", - "en", - "korean", - "japan", - "chinese_cht", - "te", - "ka", - "ta", - ]: + if lang in SPECIFIC_LANGS: rec_lang = lang + rec_model_name = None if rec_lang == "ch": rec_model_name = "PP-OCRv3_mobile_rec" @@ -440,17 +476,17 @@ class PaddleOCRCLISubcommandExecutor(PipelineCLISubcommandExecutor): help="Path to the text detection model directory.", ) subparser.add_argument( - "--text_line_orientation_model_name", + "--textline_orientation_model_name", type=str, help="Name of the text line orientation classification model.", ) subparser.add_argument( - "--text_line_orientation_model_dir", + "--textline_orientation_model_dir", type=str, help="Path to the text line orientation classification model directory.", ) subparser.add_argument( - "--text_line_orientation_batch_size", + "--textline_orientation_batch_size", type=int, help="Batch size for the text line orientation classification model.", ) @@ -472,17 +508,17 @@ class PaddleOCRCLISubcommandExecutor(PipelineCLISubcommandExecutor): subparser.add_argument( "--use_doc_orientation_classify", type=str2bool, - help="Whether to use the document image orientation classification model.", + help="Whether to use document image orientation classification.", ) subparser.add_argument( "--use_doc_unwarping", type=str2bool, - help="Whether to use the text image unwarping model.", + help="Whether to use text image unwarping.", ) subparser.add_argument( "--use_textline_orientation", type=str2bool, - help="Whether to use the text line orientation classification model.", + help="Whether to use text line orientation classification.", ) subparser.add_argument( "--text_det_limit_side_len", diff --git a/paddleocr/_pipelines/pp_chatocrv4_doc.py b/paddleocr/_pipelines/pp_chatocrv4_doc.py index 0b18115bc9a3d34259a10395ca77ae63afbbc8f8..b53725e59f5936e8e9724bb18942649384966f07 100644 --- a/paddleocr/_pipelines/pp_chatocrv4_doc.py +++ b/paddleocr/_pipelines/pp_chatocrv4_doc.py @@ -31,6 +31,9 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): doc_unwarping_model_dir=None, text_detection_model_name=None, text_detection_model_dir=None, + textline_orientation_model_name=None, + textline_orientation_model_dir=None, + 
textline_orientation_batch_size=None, text_recognition_model_name=None, text_recognition_model_dir=None, text_recognition_batch_size=None, @@ -43,6 +46,7 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): seal_text_recognition_batch_size=None, use_doc_orientation_classify=None, use_doc_unwarping=None, + use_textline_orientation=None, use_seal_recognition=None, use_table_recognition=None, layout_threshold=None, @@ -83,6 +87,7 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): *, use_doc_orientation_classify=None, use_doc_unwarping=None, + use_textline_orientation=None, use_seal_recognition=None, use_table_recognition=None, layout_threshold=None, @@ -107,6 +112,7 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): input, use_doc_orientation_classify=use_doc_orientation_classify, use_doc_unwarping=use_doc_unwarping, + use_textline_orientation=use_textline_orientation, use_seal_recognition=use_seal_recognition, use_table_recognition=use_table_recognition, layout_threshold=layout_threshold, @@ -134,6 +140,7 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): *, use_doc_orientation_classify=None, use_doc_unwarping=None, + use_textline_orientation=None, use_seal_recognition=None, use_table_recognition=None, layout_threshold=None, @@ -159,6 +166,7 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): input, use_doc_orientation_classify=use_doc_orientation_classify, use_doc_unwarping=use_doc_unwarping, + use_textline_orientation=use_textline_orientation, use_seal_recognition=use_seal_recognition, use_table_recognition=use_table_recognition, layout_threshold=layout_threshold, @@ -280,6 +288,15 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.model_dir": self._params[ "text_detection_model_dir" ], + "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_name": self._params[ + "textline_orientation_model_name" + ], + "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_dir": self._params[ + "textline_orientation_model_dir" + ], + "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.batch_size": self._params[ + "textline_orientation_batch_size" + ], "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_name": self._params[ "text_recognition_model_name" ], @@ -316,6 +333,9 @@ class PPChatOCRv4Doc(PaddleXPipelineWrapper): "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.use_doc_unwarping": self._params[ "use_doc_unwarping" ], + "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.use_textline_orientation": self._params[ + "use_textline_orientation" + ], "SubPipelines.LayoutParser.use_seal_recognition": self._params[ "use_seal_recognition" ], @@ -452,6 +472,21 @@ class PPChatOCRv4DocCLISubcommandExecutor(PipelineCLISubcommandExecutor): type=str, help="Path to the text detection model directory.", ) + subparser.add_argument( + "--textline_orientation_model_name", + type=str, + help="Name of the text line orientation classification model.", + ) + subparser.add_argument( + "--textline_orientation_model_dir", + type=str, + help="Path to the text line orientation classification model directory.", + ) + subparser.add_argument( + "--textline_orientation_batch_size", + type=int, + help="Batch size for the text line orientation classification model.", + ) subparser.add_argument( "--text_recognition_model_name", type=str, @@ -505,12 +540,17 @@ class PPChatOCRv4DocCLISubcommandExecutor(PipelineCLISubcommandExecutor): 
subparser.add_argument( "--use_doc_orientation_classify", type=str2bool, - help="Whether to use the document image orientation classification model.", + help="Whether to use document image orientation classification.", ) subparser.add_argument( "--use_doc_unwarping", type=str2bool, - help="Whether to use the text image unwarping model.", + help="Whether to use text image unwarping.", + ) + subparser.add_argument( + "--use_textline_orientation", + type=str2bool, + help="Whether to use text line orientation classification.", ) subparser.add_argument( "--use_seal_recognition", diff --git a/paddleocr/_pipelines/pp_structurev3.py b/paddleocr/_pipelines/pp_structurev3.py index 112c22e6cb01a66fa903f7c2ad93488cdd82fe17..5bc93516eb5b743f9837b68d02a8951c959ac8a6 100644 --- a/paddleocr/_pipelines/pp_structurev3.py +++ b/paddleocr/_pipelines/pp_structurev3.py @@ -80,6 +80,7 @@ class PPStructureV3(PaddleXPipelineWrapper): formula_recognition_batch_size=None, use_doc_orientation_classify=None, use_doc_unwarping=None, + use_textline_orientation=None, use_seal_recognition=None, use_table_recognition=None, use_formula_recognition=None, @@ -240,6 +241,9 @@ class PPStructureV3(PaddleXPipelineWrapper): ) ) + def concatenate_markdown_pages(self, markdown_list): + return self.paddlex_pipeline.concatenate_markdown_pages(markdown_list) + @classmethod def get_cli_subcommand_executor(cls): return PPStructureV3CLISubcommandExecutor() @@ -252,9 +256,14 @@ class PPStructureV3(PaddleXPipelineWrapper): "SubPipelines.DocPreprocessor.use_doc_unwarping": self._params[ "use_doc_unwarping" ], + "SubPipelines.GeneralOCR.use_textline_orientation": self._params[ + "use_textline_orientation" + ], "use_seal_recognition": self._params["use_seal_recognition"], "use_table_recognition": self._params["use_table_recognition"], "use_formula_recognition": self._params["use_formula_recognition"], + "use_chart_recognition": self._params["use_chart_recognition"], + "use_region_detection": self._params["use_region_detection"], "SubModules.LayoutDetection.model_name": self._params[ "layout_detection_model_name" ], @@ -536,17 +545,17 @@ class PPStructureV3CLISubcommandExecutor(PipelineCLISubcommandExecutor): subparser.add_argument( "--textline_orientation_model_name", type=str, - help="Name of the text tetextline orientation.", + help="Name of the text line orientation classification model.", ) subparser.add_argument( "--textline_orientation_model_dir", type=str, - help="Path to the text tetextline orientation directory.", + help="Path to the text line orientation classification directory.", ) subparser.add_argument( "--textline_orientation_batch_size", type=int, - help="Batch size for the tetextline orientation model.", + help="Batch size for the text line orientation classification model.", ) subparser.add_argument( "--text_recognition_model_name", @@ -696,13 +705,18 @@ class PPStructureV3CLISubcommandExecutor(PipelineCLISubcommandExecutor): "--use_doc_orientation_classify", type=str2bool, default=False, - help="Whether to use the document image orientation classification model.", + help="Whether to use document image orientation classification.", ) subparser.add_argument( "--use_doc_unwarping", type=str2bool, default=False, - help="Whether to use the text image unwarping model.", + help="Whether to use text image unwarping.", + ) + subparser.add_argument( + "--use_textline_orientation", + type=str2bool, + help="Whether to use text line orientation classification.", ) subparser.add_argument( "--use_seal_recognition", diff --git 
a/paddleocr/_pipelines/seal_recognition.py b/paddleocr/_pipelines/seal_recognition.py index 13b566684c4d11a4fb7db0d013b37249a1f4a341..8d1b11bf82a897f241e784da0a3881908c0de4e8 100644 --- a/paddleocr/_pipelines/seal_recognition.py +++ b/paddleocr/_pipelines/seal_recognition.py @@ -306,17 +306,17 @@ class SealRecognitionCLISubcommandExecutor(PipelineCLISubcommandExecutor): subparser.add_argument( "--use_doc_orientation_classify", type=str2bool, - help="Whether to use the document image orientation classification model.", + help="Whether to use document image orientation classification.", ) subparser.add_argument( "--use_doc_unwarping", type=str2bool, - help="Whether to use the document image unwarping model.", + help="Whether to use document image unwarping.", ) subparser.add_argument( "--use_layout_detection", type=str2bool, - help="Whether to use the layout detection model.", + help="Whether to use layout detection.", ) subparser.add_argument( "--layout_threshold", diff --git a/paddleocr/_pipelines/table_recognition_v2.py b/paddleocr/_pipelines/table_recognition_v2.py index a9315a20dbb569cbee263dfef9b097796f9949d5..5a1f24b50bc2942c9863945b52f9ac6c44f3f0c7 100644 --- a/paddleocr/_pipelines/table_recognition_v2.py +++ b/paddleocr/_pipelines/table_recognition_v2.py @@ -411,12 +411,12 @@ class TableRecognitionPipelineV2CLISubcommandExecutor(PipelineCLISubcommandExecu subparser.add_argument( "--use_doc_orientation_classify", type=str2bool, - help="Whether to use the document image orientation classification model.", + help="Whether to use document image orientation classification.", ) subparser.add_argument( "--use_doc_unwarping", type=str2bool, - help="Whether to use the text image unwarping model.", + help="Whether to use text image unwarping.", ) subparser.add_argument( "--use_layout_detection", diff --git a/paddleocr/_utils/cli.py b/paddleocr/_utils/cli.py index 059a426c53f2fad37c90e772f5f01f3339f39412..da47c42f90efd60f325ac4951032c26446dc3fa2 100644 --- a/paddleocr/_utils/cli.py +++ b/paddleocr/_utils/cli.py @@ -46,13 +46,23 @@ def add_simple_inference_args(subparser, *, input_help=None): ) -def perform_simple_inference(wrapper_cls, params): +def perform_simple_inference(wrapper_cls, params, predict_param_names=None): + params = params.copy() + input_ = params.pop("input") save_path = params.pop("save_path") - wrapper = wrapper_cls(**params) + if predict_param_names is not None: + predict_params = {} + for name in predict_param_names: + predict_params[name] = params.pop(name) + else: + predict_params = {} + init_params = params + + wrapper = wrapper_cls(**init_params) - result = wrapper.predict_iter(input_) + result = wrapper.predict_iter(input_, **predict_params) t1 = time.time() for i, res in enumerate(result): diff --git a/pyproject.toml b/pyproject.toml index bfdc3f14039ba1192e2aca26978b4a3118779460..71b56352860ad17540f6b341f951a877e140218a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools==72.1.0"] +requires = ["setuptools==72.1.0", "wheel", "setuptools_scm"] build-backend = "setuptools.build_meta" [project] @@ -39,7 +39,7 @@ classifiers = [ "Topic :: Utilities", ] dependencies = [ - "paddlex[ocr,ie,multimodal]==3.0.0", + "paddlex[ocr,ie,multimodal]==3.0.1", "PyYAML>=6", "typing-extensions>=4.12", ] diff --git a/tests/pipelines/test_ocr.py b/tests/pipelines/test_ocr.py index 88e6f7826d1eda698fe6eb7e5c6b4545225dabb1..8bc43cb486904e832366baff30e678b7df6b30c5 100644 --- a/tests/pipelines/test_ocr.py +++ b/tests/pipelines/test_ocr.py 
@@ -71,6 +71,18 @@ def test_predict_params( def test_lang_and_ocr_version(): + ocr_engine = PaddleOCR(lang="ch", ocr_version="PP-OCRv5") + assert ocr_engine._params["text_detection_model_name"] == "PP-OCRv5_server_det" + assert ocr_engine._params["text_recognition_model_name"] == "PP-OCRv5_server_rec" + ocr_engine = PaddleOCR(lang="chinese_cht", ocr_version="PP-OCRv5") + assert ocr_engine._params["text_detection_model_name"] == "PP-OCRv5_server_det" + assert ocr_engine._params["text_recognition_model_name"] == "PP-OCRv5_server_rec" + ocr_engine = PaddleOCR(lang="en", ocr_version="PP-OCRv5") + assert ocr_engine._params["text_detection_model_name"] == "PP-OCRv5_server_det" + assert ocr_engine._params["text_recognition_model_name"] == "PP-OCRv5_server_rec" + ocr_engine = PaddleOCR(lang="japan", ocr_version="PP-OCRv5") + assert ocr_engine._params["text_detection_model_name"] == "PP-OCRv5_server_det" + assert ocr_engine._params["text_recognition_model_name"] == "PP-OCRv5_server_rec" ocr_engine = PaddleOCR(lang="ch", ocr_version="PP-OCRv4") assert ocr_engine._params["text_detection_model_name"] == "PP-OCRv4_mobile_det" assert ocr_engine._params["text_recognition_model_name"] == "PP-OCRv4_mobile_rec" diff --git a/tests/predictors/test_text_line_orientation_classifcation.py b/tests/predictors/test_textline_orientation_classifcation.py similarity index 68% rename from tests/predictors/test_text_line_orientation_classifcation.py rename to tests/predictors/test_textline_orientation_classifcation.py index c4e43f5ee3813baa8283e68a236fed884144add6..855fc02a60984d4c551afb9f4b0d7ba30e40a870 100644 --- a/tests/predictors/test_text_line_orientation_classifcation.py +++ b/tests/predictors/test_textline_orientation_classifcation.py @@ -6,7 +6,7 @@ from .image_classification_common import check_result_item_keys @pytest.fixture(scope="module") -def text_line_orientation_classification_predictor(): +def textline_orientation_classification_predictor(): return TextLineOrientationClassification() @@ -16,8 +16,8 @@ def text_line_orientation_classification_predictor(): TEST_DATA_DIR / "textline_rot180.jpg", ], ) -def test_predict(text_line_orientation_classification_predictor, image_path): - result = text_line_orientation_classification_predictor.predict(str(image_path)) +def test_predict(textline_orientation_classification_predictor, image_path): + result = textline_orientation_classification_predictor.predict(str(image_path)) check_simple_inference_result(result) check_result_item_keys(result[0]) diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 759d457b8ba01aeec63ad8c7d068b0d6e1a94b1e..3d6d1ac3c68814bf3fc9e4b91f56d42fa3c47ee9 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -38,7 +38,10 @@ class TextDetector(object): if os.path.exists(f"{args.det_model_dir}/inference.yml"): model_config = utility.load_config(f"{args.det_model_dir}/inference.yml") model_name = model_config.get("Global", {}).get("model_name", "") - if model_name: + if model_name and model_name not in [ + "PP-OCRv5_mobile_det", + "PP-OCRv5_server_det", + ]: raise ValueError( f"{model_name} is not supported. Please check if the model is supported by the PaddleOCR wheel." 
) diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 84d8d563dd758a039b82a85240096706c7e84222..597bbffbb2dc6619fe5e774e1184ca5da7ae01fb 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -41,7 +41,10 @@ class TextRecognizer(object): if os.path.exists(f"{args.rec_model_dir}/inference.yml"): model_config = utility.load_config(f"{args.rec_model_dir}/inference.yml") model_name = model_config.get("Global", {}).get("model_name", "") - if model_name: + if model_name and model_name not in [ + "PP-OCRv5_mobile_rec", + "PP-OCRv5_server_rec", + ]: raise ValueError( f"{model_name} is not supported. Please check if the model is supported by the PaddleOCR wheel." )