microsoft · echavet · Apr 30, 2026 · May 18, 2026
diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py
@@ -8,7 +8,12 @@
     PRIORITY_SPECIFIC_FILE_FORMAT,
     PRIORITY_GENERIC_FILE_FORMAT,
 )
-from ._base_converter import DocumentConverterResult, DocumentConverter
+from ._base_converter import (
+    DocumentConverterResult,
+    DocumentConverter,
+    ConversionProgress,
+    ProgressCallback,
+)
 from ._stream_info import StreamInfo
 from ._exceptions import (
     MarkItDownException,
@@ -23,6 +28,8 @@
     "MarkItDown",
     "DocumentConverter",
     "DocumentConverterResult",
+    "ConversionProgress",
+    "ProgressCallback",
     "MarkItDownException",
     "MissingDependencyException",
     "FailedConversionAttempt",

diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py
@@ -1,7 +1,47 @@
-from typing import Any, BinaryIO, Optional
+from dataclasses import dataclass
+from typing import Any, BinaryIO, Optional, Protocol, runtime_checkable
+
 from ._stream_info import StreamInfo
 
 
+@dataclass(frozen=True)
+class ConversionProgress:
+    """Immutable progress update emitted by a converter while processing a document.
+
+    Attributes:
+        current: 1-indexed number of the unit being reported (page, slide, chapter…).
+        total:   Total number of units (0 if unknown).
+        unit:    Singular label for the unit being processed, e.g.
+                 ``"page"`` (the default), ``"slide"``, ``"chapter"``, ``"sheet"``.
+        source:  Name of the converter class emitting the progress
+                 (e.g. ``"PdfConverter"``; empty string by default).  Useful when
+                 several converters are chained or for format-specific logging.
+    """
+
+    current: int
+    total: int
+    unit: str = "page"
+    source: str = ""
+
+
+@runtime_checkable
+class ProgressCallback(Protocol):
+    """Structural interface for conversion progress callbacks.
+
+    Any callable accepting one ``ConversionProgress`` qualifies — no subclassing
+    needed.  Runtime ``isinstance`` checks verify only that ``__call__`` exists.
+
+    Example usage::
+
+        def my_callback(progress: ConversionProgress) -> None:
+            print(f"{progress.source}: {progress.current}/{progress.total} {progress.unit}s")
+
+        result = md.convert("doc.pdf", progress_callback=my_callback)
+    """
+
+    def __call__(self, progress: ConversionProgress) -> None: ...
+
+
 class DocumentConverterResult:
     """The result of converting a document to Markdown."""
 

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -261,6 +261,13 @@ def convert(
             - source: can be a path (str or Path), url, or a requests.response object
             - stream_info: optional stream info to use for the conversion. If None, infer from source
             - kwargs: additional arguments to pass to the converter
+
+        Keyword Args:
+            progress_callback: An optional callable matching
+                :class:`~markitdown.ProgressCallback`.  When provided, a supporting
+                converter calls it with a :class:`~markitdown.ConversionProgress`
+                object as it starts each logical unit (page, slide, chapter, sheet).
+                Converters that do not support progress simply ignore it.
         """
 
         # Local path or url

diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -6,7 +6,7 @@
 from typing import BinaryIO, Any, Dict, List
 
 from ._html_converter import HtmlConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverterResult, ConversionProgress
 from .._stream_info import StreamInfo
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -99,7 +99,16 @@ def convert(
 
             # Extract and convert the content
             markdown_content: List[str] = []
-            for file in spine:
+            progress_callback = kwargs.get("progress_callback")
+            total_chapters = len(spine)
+            for chapter_idx, file in enumerate(spine):
+                if progress_callback is not None:
+                    progress_callback(ConversionProgress(
+                        current=chapter_idx + 1,
+                        total=total_chapters,
+                        unit="chapter",
+                        source="EpubConverter",
+                    ))
                 if file in z.namelist():
                     with z.open(file) as f:
                         filename = os.path.basename(file)

diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -29,7 +29,12 @@ def accepts(
 
         for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
             if mimetype.startswith(prefix):
-                # Read further to see if it's a notebook
+                # Read further to see if it's a notebook.
+                # Guard against UnicodeDecodeError: accepts() must never raise —
+                # it should return False for any stream it cannot decode with
+                # the declared charset (UTF-8 when none is given).  This happens
+                # when binary content (e.g. a PDF, whose raw bytes are typically
+                # not valid UTF-8) arrives under the application/json MIME prefix.
                 cur_pos = file_stream.tell()
                 try:
                     encoding = stream_info.charset or "utf-8"
@@ -38,6 +43,9 @@ def accepts(
                         "nbformat" in notebook_content
                         and "nbformat_minor" in notebook_content
                     )
+                except (UnicodeDecodeError, ValueError):
+                    # File contains non-decodable bytes — definitely not a notebook
+                    return False
                 finally:
                     file_stream.seek(cur_pos)
 

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -3,7 +3,7 @@
 import re
 from typing import BinaryIO, Any
 
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
@@ -536,6 +536,9 @@ def convert(
 
         assert isinstance(file_stream, io.IOBase)
 
+        # Optional progress callback — reported per page
+        progress_callback = kwargs.get("progress_callback")
+
         # Read file stream into BytesIO for compatibility with pdfplumber
         pdf_bytes = io.BytesIO(file_stream.read())
 
@@ -550,7 +553,15 @@ def convert(
             plain_page_indices: list[int] = []
 
             with pdfplumber.open(pdf_bytes) as pdf:
+                total_pages = len(pdf.pages)
                 for page_idx, page in enumerate(pdf.pages):
+                    if progress_callback is not None:
+                        progress_callback(ConversionProgress(
+                            current=page_idx + 1,
+                            total=total_pages,
+                            unit="page",
+                            source="PdfConverter",
+                        ))
                     page_content = _extract_form_content_from_words(page)
 
                     if page_content is not None:

diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -10,7 +10,7 @@
 
 from ._html_converter import HtmlConverter
 from ._llm_caption import llm_caption
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
@@ -80,10 +80,19 @@ def convert(
 
         # Perform the conversion
         presentation = pptx.Presentation(file_stream)
+        progress_callback = kwargs.get("progress_callback")
         md_content = ""
         slide_num = 0
+        total_slides = len(presentation.slides)
         for slide in presentation.slides:
             slide_num += 1
+            if progress_callback is not None:
+                progress_callback(ConversionProgress(
+                    current=slide_num,
+                    total=total_slides,
+                    unit="slide",
+                    source="PptxConverter",
+                ))
 
             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
 

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,7 +1,7 @@
 import sys
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
-from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 from .._stream_info import StreamInfo
 
@@ -81,8 +81,18 @@ def convert(
             )
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        progress_callback = kwargs.get("progress_callback")
+        sheet_names = list(sheets.keys())
+        total_sheets = len(sheet_names)
         md_content = ""
-        for s in sheets:
+        for sheet_idx, s in enumerate(sheet_names):
+            if progress_callback is not None:
+                progress_callback(ConversionProgress(
+                    current=sheet_idx + 1,
+                    total=total_sheets,
+                    unit="sheet",
+                    source="XlsxConverter",
+                ))
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
@@ -143,8 +153,18 @@ def convert(
             )
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
+        progress_callback = kwargs.get("progress_callback")
+        sheet_names = list(sheets.keys())
+        total_sheets = len(sheet_names)
         md_content = ""
-        for s in sheets:
+        for sheet_idx, s in enumerate(sheet_names):
+            if progress_callback is not None:
+                progress_callback(ConversionProgress(
+                    current=sheet_idx + 1,
+                    total=total_sheets,
+                    unit="sheet",
+                    source="XlsConverter",
+                ))
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (