From a9f654287782b4ed8bbcbd93bc22f77abb9ee114 Mon Sep 17 00:00:00 2001 From: Eric Chavet Date: Thu, 30 Apr 2026 09:46:31 +0200 Subject: [PATCH 1/2] feat: add progress_callback support for conversion tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ConversionProgress dataclass and ProgressCallback Protocol to enable real-time progress reporting during document conversion. Converters emit progress events for each logical unit processed: - PdfConverter: per page - PptxConverter: per slide - EpubConverter: per chapter - XlsxConverter / XlsConverter: per sheet The callback is optional and passed via kwargs (progress_callback). Converters that do not support progress simply ignore it. Fully backward-compatible — no changes to existing API signatures. Signed-off-by: Eric Chavet --- .../markitdown/src/markitdown/__init__.py | 9 +++- .../src/markitdown/_base_converter.py | 42 ++++++++++++++++++- .../markitdown/src/markitdown/_markitdown.py | 7 ++++ .../markitdown/converters/_epub_converter.py | 13 +++++- .../markitdown/converters/_pdf_converter.py | 13 +++++- .../markitdown/converters/_pptx_converter.py | 11 ++++- .../markitdown/converters/_xlsx_converter.py | 26 ++++++++++-- 7 files changed, 112 insertions(+), 9 deletions(-) diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index af356dd63..6b9705ae7 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -8,7 +8,12 @@ PRIORITY_SPECIFIC_FILE_FORMAT, PRIORITY_GENERIC_FILE_FORMAT, ) -from ._base_converter import DocumentConverterResult, DocumentConverter +from ._base_converter import ( + DocumentConverterResult, + DocumentConverter, + ConversionProgress, + ProgressCallback, +) from ._stream_info import StreamInfo from ._exceptions import ( MarkItDownException, @@ -23,6 +28,8 @@ "MarkItDown", "DocumentConverter", "DocumentConverterResult", + 
"ConversionProgress", + "ProgressCallback", "MarkItDownException", "MissingDependencyException", "FailedConversionAttempt", diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index fa2b11145..b8f492526 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -1,7 +1,47 @@ -from typing import Any, BinaryIO, Optional +from dataclasses import dataclass +from typing import Any, BinaryIO, Optional, Protocol, runtime_checkable + from ._stream_info import StreamInfo +@dataclass(frozen=True) +class ConversionProgress: + """Progress update emitted by a converter during document processing. + + Attributes: + current: Current unit number (1-indexed page, slide, chapter…). + total: Total number of units (0 if unknown). + unit: Semantic label for the unit being processed + (``"page"``, ``"slide"``, ``"chapter"``, ``"sheet"``). + source: Name of the converter class emitting the progress + (e.g. ``"PdfConverter"``). Useful when several converters + are chained or when the caller wants format-specific logging. + """ + + current: int + total: int + unit: str = "page" + source: str = "" + + +@runtime_checkable +class ProgressCallback(Protocol): + """Interface for conversion progress callbacks. + + Any callable matching this signature is accepted — no explicit + subclassing required (structural subtyping / duck typing). + + Example usage:: + + def my_callback(progress: ConversionProgress) -> None: + print(f"{progress.source}: {progress.current}/{progress.total} {progress.unit}s") + + result = md.convert("doc.pdf", progress_callback=my_callback) + """ + + def __call__(self, progress: ConversionProgress) -> None: ... 
+ + class DocumentConverterResult: """The result of converting a document to Markdown.""" diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..6511d44df 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -261,6 +261,13 @@ def convert( - source: can be a path (str or Path), url, or a requests.response object - stream_info: optional stream info to use for the conversion. If None, infer from source - kwargs: additional arguments to pass to the converter + + Keyword Args: + progress_callback: An optional callable matching + :class:`~markitdown.ProgressCallback`. When provided, the + converter will call it with a :class:`~markitdown.ConversionProgress` + object for each logical unit processed (page, slide, chapter, sheet). + Converters that do not support progress simply ignore it. """ # Local path or url diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py index 3be65b016..63b6f3ec7 100644 --- a/packages/markitdown/src/markitdown/converters/_epub_converter.py +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -6,7 +6,7 @@ from typing import BinaryIO, Any, Dict, List from ._html_converter import HtmlConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverterResult, ConversionProgress from .._stream_info import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ @@ -99,7 +99,16 @@ def convert( # Extract and convert the content markdown_content: List[str] = [] - for file in spine: + progress_callback = kwargs.get("progress_callback") + total_chapters = len(spine) + for chapter_idx, file in enumerate(spine): + if progress_callback is not None: + progress_callback(ConversionProgress( + current=chapter_idx + 1, + total=total_chapters, + unit="chapter", + 
source="EpubConverter", + )) if file in z.namelist(): with z.open(file) as f: filename = os.path.basename(file) diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..28527d6e4 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -3,7 +3,7 @@ import re from typing import BinaryIO, Any -from .._base_converter import DocumentConverter, DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -536,6 +536,9 @@ def convert( assert isinstance(file_stream, io.IOBase) + # Optional progress callback — reported per page + progress_callback = kwargs.get("progress_callback") + # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) @@ -550,7 +553,15 @@ def convert( plain_page_indices: list[int] = [] with pdfplumber.open(pdf_bytes) as pdf: + total_pages = len(pdf.pages) for page_idx, page in enumerate(pdf.pages): + if progress_callback is not None: + progress_callback(ConversionProgress( + current=page_idx + 1, + total=total_pages, + unit="page", + source="PdfConverter", + )) page_content = _extract_form_content_from_words(page) if page_content is not None: diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 360f17706..05b15c2da 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -10,7 +10,7 @@ from ._html_converter import HtmlConverter from ._llm_caption import llm_caption -from .._base_converter import DocumentConverter, DocumentConverterResult +from 
.._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -80,10 +80,19 @@ def convert( # Perform the conversion presentation = pptx.Presentation(file_stream) + progress_callback = kwargs.get("progress_callback") md_content = "" slide_num = 0 + total_slides = len(presentation.slides) for slide in presentation.slides: slide_num += 1 + if progress_callback is not None: + progress_callback(ConversionProgress( + current=slide_num, + total=total_slides, + unit="slide", + source="PptxConverter", + )) md_content += f"\n\n\n" diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 4186ec773..546a95c11 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -1,7 +1,7 @@ import sys from typing import BinaryIO, Any from ._html_converter import HtmlConverter -from .._base_converter import DocumentConverter, DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._stream_info import StreamInfo @@ -81,8 +81,18 @@ def convert( ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") + progress_callback = kwargs.get("progress_callback") + sheet_names = list(sheets.keys()) + total_sheets = len(sheet_names) md_content = "" - for s in sheets: + for sheet_idx, s in enumerate(sheet_names): + if progress_callback is not None: + progress_callback(ConversionProgress( + current=sheet_idx + 1, + total=total_sheets, + unit="sheet", + source="XlsxConverter", + )) md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( @@ -143,8 +153,18 @@ def convert( ) sheets 
= pd.read_excel(file_stream, sheet_name=None, engine="xlrd") + progress_callback = kwargs.get("progress_callback") + sheet_names = list(sheets.keys()) + total_sheets = len(sheet_names) md_content = "" - for s in sheets: + for sheet_idx, s in enumerate(sheet_names): + if progress_callback is not None: + progress_callback(ConversionProgress( + current=sheet_idx + 1, + total=total_sheets, + unit="sheet", + source="XlsConverter", + )) md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( From 8845fec19bea3e6b1dbfc7cd57b231374305b3a5 Mon Sep 17 00:00:00 2001 From: Eric Chavet Date: Mon, 18 May 2026 10:36:04 +0200 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20IpynbConverter.accepts()=20=E2=80=94?= =?UTF-8?q?=20catch=20UnicodeDecodeError=20for=20non-ASCII=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit accepts() must never raise — it is a predicate that returns True/False. When a file contains non-ASCII bytes (e.g. a French PDF with accented characters encoded as multi-byte UTF-8 sequences like 0xc3...), decoding with 'ascii' or even 'utf-8' can fail if the stream contains arbitrary binary content. The fix wraps the decode + check block in a try/except (UnicodeDecodeError, ValueError) and returns False on failure. A file that cannot be decoded is definitively not a Jupyter notebook. 
Fixes: https://github.com/microsoft/markitdown/issues/1894 --- .../src/markitdown/converters/_ipynb_converter.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index b15e77aa2..6d7bf6d82 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -29,7 +29,12 @@ def accepts( for prefix in CANDIDATE_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): - # Read further to see if it's a notebook + # Read further to see if it's a notebook. + # Guard against UnicodeDecodeError: accepts() must never raise — + # it should return False for any file it cannot decode. + # This can happen when a file matching a candidate MIME prefix is not + # decodable with the declared charset (e.g. reported as ASCII but holding + # UTF-8 accented characters: é, è, à → bytes starting with 0xc3). cur_pos = file_stream.tell() try: encoding = stream_info.charset or "utf-8" @@ -38,6 +43,9 @@ def accepts( "nbformat" in notebook_content and "nbformat_minor" in notebook_content ) + except (UnicodeDecodeError, ValueError): + # File contains non-decodable bytes — definitely not a notebook + return False finally: file_stream.seek(cur_pos)