From 1965f4875c234b4afbc71d596a7906ba523fd4ea Mon Sep 17 00:00:00 2001
From: Jeremias Werner
Date: Tue, 16 Sep 2025 17:00:05 +0200
Subject: [PATCH] change docling commands to convert to json instead of markdown

---
 .../tutorials/docling/README.md      |  2 +-
 .../tutorials/docling/commands.jsonl | 22 +++++++++----------
 .../tutorials/inferencing/run        |  1 -
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/beta/serverless-fleets/tutorials/docling/README.md b/beta/serverless-fleets/tutorials/docling/README.md
index 12de7063..77a7745d 100644
--- a/beta/serverless-fleets/tutorials/docling/README.md
+++ b/beta/serverless-fleets/tutorials/docling/README.md
@@ -2,7 +2,7 @@
 
 ![](../../images/docling-picture.png)
 
-This tutorial provides a comprehensive guide on using Docling to convert PDFs into Markdown format using serverless fleets. It leverages cloud object storage for managing both the input PDFs and the resulting Markdown files. The process is streamlined using IBM’s Code Engine to build the Docling container, which is then pushed to a container registry. Users can run a serverless fleet, which autonomously spawns workers to run the Docling container for efficient, scalable conversion tasks.
+[Docling](https://docling-project.github.io/docling/) simplifies document processing, parsing diverse formats, including advanced PDF understanding, and providing seamless integrations with the gen AI ecosystem. This tutorial provides a comprehensive guide on using Docling to convert PDFs into JSON format on serverless fleets. It leverages [IBM Cloud Object Storage](https://www.ibm.com/de-de/products/cloud-object-storage) for managing the input (PDF) and output (JSON) files. Users can run a serverless fleet, which autonomously spawns workers to run the Docling container for efficient, scalable conversion tasks. This tutorial uses the CPU version of the [docling container image](https://github.com/docling-project/docling-serve?tab=readme-ov-file#container-images), which can easily be replaced with the GPU version in combination with a Serverless GPU.
 
 Key steps covered in the Tutorial:
 1. Upload the examples PDFs to COS
diff --git a/beta/serverless-fleets/tutorials/docling/commands.jsonl b/beta/serverless-fleets/tutorials/docling/commands.jsonl
index a15b8dd1..93ce5181 100644
--- a/beta/serverless-fleets/tutorials/docling/commands.jsonl
+++ b/beta/serverless-fleets/tutorials/docling/commands.jsonl
@@ -1,11 +1,11 @@
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/2203.01017v2.pdf", "--output", "/output/docling_2203.01017v2.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/2206.01062.pdf", "--output", "/output/docling_2206.01062.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/2305.03393v1-pg9.pdf", "--output", "/output/docling_2305.03393v1-pg9.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/2305.03393v1.pdf", "--output", "/output/docling_2305.03393v1.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/amt_handbook_sample.pdf", "--output", "/output/docling_amt_handbook_sample.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/code_and_formula.pdf", "--output", "/output/docling_code_and_formula.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/picture_classification.pdf", "--output", "/output/docling_picture_classification.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/redp5110_sampled.pdf", "--output", "/output/docling_redp5110_sampled.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/right_to_left_01.pdf", "--output", "/output/docling_right_to_left_01.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/right_to_left_02.pdf", "--output", "/output/docling_right_to_left_02.pdf.md" ]}
-{ "cmds":["docling"], "args": ["--num-threads", "12", "/input/pdfs/right_to_left_03.pdf", "--output", "/output/docling_right_to_left_03.pdf.md" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/2203.01017v2.pdf", "--output", "/output/docling_2203.01017v2.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/2206.01062.pdf", "--output", "/output/docling_2206.01062.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/2305.03393v1-pg9.pdf", "--output", "/output/docling_2305.03393v1-pg9.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/2305.03393v1.pdf", "--output", "/output/docling_2305.03393v1.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/amt_handbook_sample.pdf", "--output", "/output/docling_amt_handbook_sample.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/code_and_formula.pdf", "--output", "/output/docling_code_and_formula.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/picture_classification.pdf", "--output", "/output/docling_picture_classification.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/redp5110_sampled.pdf", "--output", "/output/docling_redp5110_sampled.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/right_to_left_01.pdf", "--output", "/output/docling_right_to_left_01.pdf.json" ]}
+{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/right_to_left_02.pdf", "--output", "/output/docling_right_to_left_02.pdf.json" ]}
"/output/docling_right_to_left_02.pdf.json" ]} +{ "cmds":["docling"], "args": ["--num-threads", "12", "--to", "json", "/input/pdfs/right_to_left_03.pdf", "--output", "/output/docling_right_to_left_03.pdf.json" ]} diff --git a/beta/serverless-fleets/tutorials/inferencing/run b/beta/serverless-fleets/tutorials/inferencing/run index f1036d9b..aa68b6b8 100755 --- a/beta/serverless-fleets/tutorials/inferencing/run +++ b/beta/serverless-fleets/tutorials/inferencing/run @@ -34,7 +34,6 @@ ibmcloud code-engine beta fleet create --name "fleet-${uuid}-1" \ --max-scale 1 \ --tasks-from-local-file commands.jsonl \ --gpu l40s:1 \ ---cpu 24 \ --memory 120G \ --tasks-state-store fleet-task-store \ --mount-data-store /input=fleet-input-store:/inferencing \