表格导出
In [ ]
已复制!
import logging
import time
from pathlib import Path
import logging import time from pathlib import Path
In [ ]
已复制!
import pandas as pd
import pandas as pd
In [ ]
已复制!
from docling.document_converter import DocumentConverter
from docling.document_converter import DocumentConverter
In [ ]
已复制!
# Module-level logger named after this module; main() reports progress through it.
_log = logging.getLogger(__name__)
In [ ]
已复制!
def main():
    """Convert the sample PDF and export every detected table.

    The document at ``./tests/data/pdf/2206.01062.pdf`` is converted with
    ``DocumentConverter``. For each table in the converted document the
    function prints the table as markdown and writes two files into the
    ``scratch`` directory (created if missing):

    * ``<stem>-table-<n>.csv``  -- the table as a pandas DataFrame dump
    * ``<stem>-table-<n>.html`` -- the table rendered as HTML

    where ``<stem>`` is the input file stem and ``<n>`` is the 1-based
    table number. The total elapsed time is logged at INFO level.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

    # The timer covers conversion and all table exports.
    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        # NOTE: the printed heading uses the 0-based index, while the file
        # names below use the 1-based table number.
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        # Explicit UTF-8: without it the platform default encoding is used,
        # which can fail or corrupt non-ASCII table content.
        with element_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html(doc=conv_res.document))

    elapsed = time.time() - start_time

    _log.info(f"Document converted and tables exported in {elapsed:.2f} seconds.")
def main():
    """Convert the sample PDF and export every detected table.

    The document at ``./tests/data/pdf/2206.01062.pdf`` is converted with
    ``DocumentConverter``. For each table in the converted document the
    function prints the table as markdown and writes it to the ``scratch``
    directory (created if missing) as ``<stem>-table-<n>.csv`` and
    ``<stem>-table-<n>.html``, where ``<stem>`` is the input file stem and
    ``<n>`` is the 1-based table number. The total elapsed time is logged
    at INFO level. Printed and logged messages are localised (Chinese).
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

    # The timer covers conversion and all table exports.
    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        # NOTE: the printed heading uses the 0-based index, while the file
        # names below use the 1-based table number.
        print(f"## 表格 {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info(f"正在将 CSV 表格保存到 {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info(f"正在将 HTML 表格保存到 {element_html_filename}")
        # Explicit UTF-8: without it the platform default encoding is used,
        # which can fail or corrupt non-ASCII table content.
        with element_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html(doc=conv_res.document))

    elapsed = time.time() - start_time

    _log.info(f"文档转换并导出表格耗时 {elapsed:.2f} 秒。")
In [ ]
已复制!
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()