自定义转换
In [ ]
已复制!
import json
import logging
import time
from pathlib import Path
import json import logging import time from pathlib import Path
In [ ]
已复制!
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption
In [ ]
已复制!
# Module-level logger named after this module; main() uses it to report
# how long the document conversion took.
_log = logging.getLogger(__name__)
_log = logging.getLogger(__name__)
In [ ]
已复制!
def main():
    """Convert a sample PDF with a customized Docling pipeline and export it.

    Configures a ``DocumentConverter`` whose PDF pipeline has OCR and table
    structure recognition (with cell matching) enabled, an OCR language list
    of ``["es"]`` and 4 accelerator threads on an auto-selected device.  The
    hard-coded sample document is converted, the elapsed time is logged, and
    the result is written to the ``scratch`` directory as JSON, plain text,
    Markdown and document tags.

    Returns:
        None. All output is written to disk as a side effect.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.
    #
    # NOTE(review): the alternative sections reference
    # PyPdfiumDocumentBackend, TesseractOcrOptions, TesseractCliOcrOptions and
    # OcrMacOptions, none of which appear in the visible import cell; they
    # must be imported before the corresponding section is uncommented
    # (presumably from docling.backend.pypdfium2_backend and
    # docling.datamodel.pipeline_options -- confirm).

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with EasyOCR
    # ----------------------
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = ["es"]
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4, device=AcceleratorDevice.AUTO
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Docling Parse with EasyOCR (CPU only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with ocrmac(Mac only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = OcrMacOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    ###########################################################################

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    conv_result = doc_converter.convert(input_doc_path)
    elapsed = time.perf_counter() - start_time

    _log.info("Document converted in %.2f seconds.", elapsed)

    ## Export results
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem
    document = conv_result.document

    # Each export is produced and written in turn, so a failure in a later
    # exporter still leaves the earlier files on disk.

    # Export Deep Search document JSON format:
    (output_dir / f"{doc_filename}.json").write_text(
        json.dumps(document.export_to_dict()), encoding="utf-8"
    )

    # Export Text format:
    (output_dir / f"{doc_filename}.txt").write_text(
        document.export_to_text(), encoding="utf-8"
    )

    # Export Markdown format:
    (output_dir / f"{doc_filename}.md").write_text(
        document.export_to_markdown(), encoding="utf-8"
    )

    # Export Document Tags format:
    # NOTE(review): newer docling-core releases may expose this exporter under
    # a different name (export_to_doctags) -- confirm against the installed
    # version before renaming.
    (output_dir / f"{doc_filename}.doctags").write_text(
        document.export_to_document_tokens(), encoding="utf-8"
    )
def main(): logging.basicConfig(level=logging.INFO) input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") ########################################################################### # The following sections contain a combination of PipelineOptions # and PDF Backends for various configurations. # Uncomment one section at the time to see the differences in the output. # PyPdfium without EasyOCR # -------------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = False # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = False # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption( # pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend # ) # } # ) # PyPdfium with EasyOCR # ----------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption( # pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend # ) # } # ) # Docling Parse without EasyOCR # ------------------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = False # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) # } # ) # Docling Parse with EasyOCR # ---------------------- pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.ocr_options.lang = ["es"] pipeline_options.accelerator_options = AcceleratorOptions( num_threads=4, device=AcceleratorDevice.AUTO ) doc_converter = DocumentConverter( 
format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) } ) # Docling Parse with EasyOCR (CPU only) # ---------------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.ocr_options.use_gpu = False # <-- set this. # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) # } # ) # Docling Parse with Tesseract # ---------------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.ocr_options = TesseractOcrOptions() # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) # } # ) # Docling Parse with Tesseract CLI # ---------------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.ocr_options = TesseractCliOcrOptions() # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) # } # ) # Docling Parse with ocrmac(Mac only) # ---------------------- # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.ocr_options = OcrMacOptions() # doc_converter = DocumentConverter( # format_options={ # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) # } # ) ########################################################################### start_time = time.time() conv_result = doc_converter.convert(input_doc_path) 
end_time = time.time() - start_time _log.info(f"Document converted in {end_time:.2f} seconds.") ## Export results output_dir = Path("scratch") output_dir.mkdir(parents=True, exist_ok=True) doc_filename = conv_result.input.file.stem # Export Deep Search document JSON format: with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp: fp.write(json.dumps(conv_result.document.export_to_dict())) # Export Text format: with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp: fp.write(conv_result.document.export_to_text()) # Export Markdown format: with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp: fp.write(conv_result.document.export_to_markdown()) # Export Document Tags format: with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp: fp.write(conv_result.document.export_to_document_tokens())
In [ ]
已复制!
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
if __name__ == "__main__": main()