From a9f654287782b4ed8bbcbd93bc22f77abb9ee114 Mon Sep 17 00:00:00 2001
From: Eric Chavet <echavet@gmail.com>
Date: Thu, 30 Apr 2026 09:46:31 +0200
Subject: [PATCH 1/2] feat: add progress_callback support for conversion
 tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add ConversionProgress dataclass and ProgressCallback Protocol to enable
real-time progress reporting during document conversion.

Converters emit one progress event per logical unit, just before that unit is processed:
- PdfConverter: per page
- PptxConverter: per slide
- EpubConverter: per chapter
- XlsxConverter / XlsConverter: per sheet

The callback is optional and passed via kwargs (progress_callback).
Converters that do not support progress simply ignore it.
Fully backward-compatible — no changes to existing API signatures.

Signed-off-by: Eric Chavet <echavet@gmail.com>
---
 .../markitdown/src/markitdown/__init__.py     |  9 +++-
 .../src/markitdown/_base_converter.py         | 42 ++++++++++++++++++-
 .../markitdown/src/markitdown/_markitdown.py  |  7 ++++
 .../markitdown/converters/_epub_converter.py  | 13 +++++-
 .../markitdown/converters/_pdf_converter.py   | 13 +++++-
 .../markitdown/converters/_pptx_converter.py  | 11 ++++-
 .../markitdown/converters/_xlsx_converter.py  | 26 ++++++++++--
 7 files changed, 112 insertions(+), 9 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py
index af356dd63..6b9705ae7 100644
--- a/packages/markitdown/src/markitdown/__init__.py
+++ b/packages/markitdown/src/markitdown/__init__.py
@@ -8,7 +8,12 @@
     PRIORITY_SPECIFIC_FILE_FORMAT,
     PRIORITY_GENERIC_FILE_FORMAT,
 )
-from ._base_converter import DocumentConverterResult, DocumentConverter
+from ._base_converter import (
+    DocumentConverterResult,
+    DocumentConverter,
+    ConversionProgress,
+    ProgressCallback,
+)
 from ._stream_info import StreamInfo
 from ._exceptions import (
     MarkItDownException,
@@ -23,6 +28,8 @@
     "MarkItDown",
     "DocumentConverter",
     "DocumentConverterResult",
+    "ConversionProgress",
+    "ProgressCallback",
     "MarkItDownException",
     "MissingDependencyException",
     "FailedConversionAttempt",
diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py
index fa2b11145..b8f492526 100644
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@@ -1,7 +1,47 @@
-from typing import Any, BinaryIO, Optional
+from dataclasses import dataclass
+from typing import Any, BinaryIO, Optional, Protocol, runtime_checkable
+
 from ._stream_info import StreamInfo
 
 
+@dataclass(frozen=True)
+class ConversionProgress:
+    """Progress update emitted by a converter during document processing.
+
+    Attributes:
+        current: 1-indexed number of the unit being started (page, slide, chapter…).
+        total:   Total number of units (0 if unknown).
+        unit:    Semantic label for the unit being processed
+                 (``"page"``, ``"slide"``, ``"chapter"``, ``"sheet"``).
+        source:  Name of the converter class emitting the progress
+                 (e.g. ``"PdfConverter"``).  Useful when several converters
+                 are chained or when the caller wants format-specific logging.
+    """
+
+    current: int
+    total: int
+    unit: str = "page"
+    source: str = ""
+
+
+@runtime_checkable
+class ProgressCallback(Protocol):
+    """Interface for conversion progress callbacks.
+
+    Any callable matching this signature is accepted — no explicit
+    subclassing required (structural subtyping / duck typing).
+
+    Example usage::
+
+        def my_callback(progress: ConversionProgress) -> None:
+            print(f"{progress.source}: {progress.current}/{progress.total} {progress.unit}s")
+
+        result = md.convert("doc.pdf", progress_callback=my_callback)
+    """
+
+    def __call__(self, progress: ConversionProgress) -> None: ...
+
+
 class DocumentConverterResult:
     """The result of converting a document to Markdown."""
 
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..6511d44df 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -261,6 +261,13 @@ def convert(
             - source: can be a path (str or Path), url, or a requests.response object
             - stream_info: optional stream info to use for the conversion. If None, infer from source
             - kwargs: additional arguments to pass to the converter
+
+        Keyword Args:
+            progress_callback: An optional callable matching
+                :class:`~markitdown.ProgressCallback`.  When provided, the
+                converter will call it with a :class:`~markitdown.ConversionProgress`
+                object for each logical unit processed (page, slide, chapter, sheet).
+                Converters that do not support progress simply ignore it.
         """
 
         # Local path or url
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
index 3be65b016..63b6f3ec7 100644
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -6,7 +6,7 @@
 from typing import BinaryIO, Any, Dict, List
 
 from ._html_converter import HtmlConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverterResult, ConversionProgress
 from .._stream_info import StreamInfo
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -99,7 +99,16 @@ def convert(
 
             # Extract and convert the content
             markdown_content: List[str] = []
-            for file in spine:
+            progress_callback = kwargs.get("progress_callback")
+            total_chapters = len(spine)
+            for chapter_idx, file in enumerate(spine):
+                if progress_callback is not None:
+                    progress_callback(ConversionProgress(
+                        current=chapter_idx + 1,
+                        total=total_chapters,
+                        unit="chapter",
+                        source="EpubConverter",
+                    ))
                 if file in z.namelist():
                     with z.open(file) as f:
                         filename = os.path.basename(file)
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..28527d6e4 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -3,7 +3,7 @@
 import re
 from typing import BinaryIO, Any
 
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
@@ -536,6 +536,9 @@ def convert(
 
         assert isinstance(file_stream, io.IOBase)
 
+        # Optional progress callback — reported per page
+        progress_callback = kwargs.get("progress_callback")
+
         # Read file stream into BytesIO for compatibility with pdfplumber
         pdf_bytes = io.BytesIO(file_stream.read())
 
@@ -550,7 +553,15 @@ def convert(
             plain_page_indices: list[int] = []
 
             with pdfplumber.open(pdf_bytes) as pdf:
+                total_pages = len(pdf.pages)
                 for page_idx, page in enumerate(pdf.pages):
+                    if progress_callback is not None:
+                        progress_callback(ConversionProgress(
+                            current=page_idx + 1,
+                            total=total_pages,
+                            unit="page",
+                            source="PdfConverter",
+                        ))
                     page_content = _extract_form_content_from_words(page)
 
                     if page_content is not None:
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index 360f17706..05b15c2da 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -10,7 +10,7 @@
 
 from ._html_converter import HtmlConverter
 from ._llm_caption import llm_caption
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
@@ -80,10 +80,19 @@ def convert(
 
         # Perform the conversion
         presentation = pptx.Presentation(file_stream)
+        progress_callback = kwargs.get("progress_callback")
         md_content = ""
         slide_num = 0
+        total_slides = len(presentation.slides)
         for slide in presentation.slides:
             slide_num += 1
+            if progress_callback is not None:
+                progress_callback(ConversionProgress(
+                    current=slide_num,
+                    total=total_slides,
+                    unit="slide",
+                    source="PptxConverter",
+                ))
 
             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
 
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..546a95c11 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,7 +1,7 @@
 import sys
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 from .._stream_info import StreamInfo
 
@@ -81,8 +81,18 @@ def convert(
             )
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        progress_callback = kwargs.get("progress_callback")
+        sheet_names = list(sheets.keys())
+        total_sheets = len(sheet_names)
         md_content = ""
-        for s in sheets:
+        for sheet_idx, s in enumerate(sheet_names):
+            if progress_callback is not None:
+                progress_callback(ConversionProgress(
+                    current=sheet_idx + 1,
+                    total=total_sheets,
+                    unit="sheet",
+                    source="XlsxConverter",
+                ))
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
@@ -143,8 +153,18 @@ def convert(
             )
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
+        progress_callback = kwargs.get("progress_callback")
+        sheet_names = list(sheets.keys())
+        total_sheets = len(sheet_names)
         md_content = ""
-        for s in sheets:
+        for sheet_idx, s in enumerate(sheet_names):
+            if progress_callback is not None:
+                progress_callback(ConversionProgress(
+                    current=sheet_idx + 1,
+                    total=total_sheets,
+                    unit="sheet",
+                    source="XlsConverter",
+                ))
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (

From 8845fec19bea3e6b1dbfc7cd57b231374305b3a5 Mon Sep 17 00:00:00 2001
From: Eric Chavet <echavet@gmail.com>
Date: Mon, 18 May 2026 10:36:04 +0200
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20IpynbConverter.accepts()=20=E2=80=94?=
 =?UTF-8?q?=20catch=20UnicodeDecodeError=20for=20non-ASCII=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

accepts() must never raise — it is a predicate that returns True/False.
When the stream contains bytes that are invalid for the selected charset
(stream_info.charset, falling back to UTF-8), decode() raises
UnicodeDecodeError and accepts() let it propagate. Typical triggers are a
French PDF whose accented characters are multi-byte UTF-8 sequences
starting with 0xc3 being read as ASCII, or arbitrary binary content being
read as UTF-8.

The fix adds an except (UnicodeDecodeError, ValueError) clause to the
existing try/finally around the decode + check block and returns False on
failure; the finally clause still restores the stream position.
A file that cannot be decoded is definitely not a Jupyter notebook.

Fixes: https://github.com/microsoft/markitdown/issues/1894
---
 .../src/markitdown/converters/_ipynb_converter.py      | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
index b15e77aa2..6d7bf6d82 100644
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -29,7 +29,12 @@ def accepts(
 
         for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
             if mimetype.startswith(prefix):
-                # Read further to see if it's a notebook
+                # Read further to see if it's a notebook.
+                # accepts() is a predicate and must never raise: content that
+                # cannot be decoded with the selected charset is not a notebook,
+                # so return False. This happens when a candidate MIME prefix
+                # matches binary data, or text in another encoding (e.g. UTF-8
+                # é, è, à → bytes starting with 0xc3, decoded as ASCII).
                 cur_pos = file_stream.tell()
                 try:
                     encoding = stream_info.charset or "utf-8"
@@ -38,6 +43,9 @@ def accepts(
                         "nbformat" in notebook_content
                         and "nbformat_minor" in notebook_content
                     )
+                except (UnicodeDecodeError, ValueError):
+                    # File contains non-decodable bytes — definitely not a notebook
+                    return False
                 finally:
                     file_stream.seek(cur_pos)