多格式转换
输入 [ ]
已复制!
import json
import logging
from pathlib import Path
import json import logging from pathlib import Path
输入 [ ]
已复制!
import yaml
import yaml
输入 [ ]
已复制!
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat from docling.document_converter import ( DocumentConverter, PdfFormatOption, WordFormatOption, ) from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
输入 [ ]
已复制!
_log = logging.getLogger(__name__)
_log = logging.getLogger(__name__)
输入 [ ]
已复制!
def main():
    """Convert a fixed list of sample documents and export each result.

    Every successfully converted document is written to the ``scratch``
    directory three times, named after the input file's stem: as Markdown
    (``.md``), JSON (``.json``) and YAML (``.yaml``). The directory is
    created if it does not exist yet.
    """
    input_paths = [
        Path("README.md"),
        Path("tests/data/html/wiki_duck.html"),
        Path("tests/data/docx/word_sample.docx"),
        Path("tests/data/docx/lorem_ipsum.docx"),
        Path("tests/data/pptx/powerpoint_sample.pptx"),
        Path("tests/data/2305.03393v1-pg9-img.png"),
        Path("tests/data/pdf/2206.01062.pdf"),
        Path("tests/data/asciidoc/test_01.asciidoc"),
    ]

    # NOTE: to rely on the library defaults, a bare ``DocumentConverter()``
    # can be used instead of the customized instance built below.
    doc_converter = DocumentConverter(
        # All of the arguments below are optional and have internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.CSV,
            InputFormat.MD,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
            ),
            # A backend could also be passed here (e.g. MsWordDocumentBackend).
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
        },
    )

    # The output directory is loop-invariant: build it once and make sure it
    # exists, otherwise the first write fails with FileNotFoundError.
    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        _log.debug(res.document._export_to_indented_text(max_text_len=16))

        stem = res.input.file.stem
        # Export once and reuse the dict for both the JSON and YAML outputs.
        doc_dict = res.document.export_to_dict()

        # Explicit UTF-8 so the exports do not depend on the platform's
        # default locale encoding.
        (out_path / f"{stem}.md").write_text(
            res.document.export_to_markdown(), encoding="utf-8"
        )
        (out_path / f"{stem}.json").write_text(
            json.dumps(doc_dict), encoding="utf-8"
        )
        (out_path / f"{stem}.yaml").write_text(
            yaml.safe_dump(doc_dict), encoding="utf-8"
        )

        # Report only after the files have actually been written.
        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {out_path!s}"
        )
def main(): input_paths = [ Path("README.md"), Path("tests/data/html/wiki_duck.html"), Path("tests/data/docx/word_sample.docx"), Path("tests/data/docx/lorem_ipsum.docx"), Path("tests/data/pptx/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/pdf/2206.01062.pdf"), Path("tests/data/asciidoc/test_01.asciidoc"), ] ## for defaults use: # doc_converter = DocumentConverter() ## to customize use: doc_converter = ( DocumentConverter( # all of the below is optional, has internal defaults. allowed_formats=[ InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML, InputFormat.PPTX, InputFormat.ASCIIDOC, InputFormat.CSV, InputFormat.MD, ], # whitelist formats, non-matching files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend ), InputFormat.DOCX: WordFormatOption( pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ), }, ) ) conv_results = doc_converter.convert_all(input_paths) for res in conv_results: out_path = Path("scratch") print( f"Document {res.input.file.name} converted." f"\nSaved markdown output to: {out_path!s}" ) _log.debug(res.document._export_to_indented_text(max_text_len=16)) # Export Docling document format to markdowndoc: with (out_path / f"{res.input.file.stem}.md").open("w") as fp: fp.write(res.document.export_to_markdown()) with (out_path / f"{res.input.file.stem}.json").open("w") as fp: fp.write(json.dumps(res.document.export_to_dict())) with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp: fp.write(yaml.safe_dump(res.document.export_to_dict()))
输入 [ ]
已复制!
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
if __name__ == "__main__": main()