Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion packages/markitdown/src/markitdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
PRIORITY_SPECIFIC_FILE_FORMAT,
PRIORITY_GENERIC_FILE_FORMAT,
)
from ._base_converter import DocumentConverterResult, DocumentConverter
from ._base_converter import (
DocumentConverterResult,
DocumentConverter,
ConversionProgress,
ProgressCallback,
)
from ._stream_info import StreamInfo
from ._exceptions import (
MarkItDownException,
Expand All @@ -23,6 +28,8 @@
"MarkItDown",
"DocumentConverter",
"DocumentConverterResult",
"ConversionProgress",
"ProgressCallback",
"MarkItDownException",
"MissingDependencyException",
"FailedConversionAttempt",
Expand Down
42 changes: 41 additions & 1 deletion packages/markitdown/src/markitdown/_base_converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,47 @@
from typing import Any, BinaryIO, Optional
from dataclasses import dataclass
from typing import Any, BinaryIO, Optional, Protocol, runtime_checkable

from ._stream_info import StreamInfo


@dataclass(frozen=True)
class ConversionProgress:
    """Immutable snapshot describing how far a conversion has advanced.

    A converter builds one instance per logical unit it handles and hands
    it to the caller-supplied progress callback.

    Attributes:
        current: 1-based position of the unit being handled (page, slide,
            chapter, ...).
        total: Number of units in the document, or 0 when that is unknown.
        unit: Label naming the kind of unit being counted, e.g.
            ``"page"``, ``"slide"``, ``"chapter"`` or ``"sheet"``.
        source: Class name of the converter that produced the update
            (e.g. ``"PdfConverter"``), so callers can tell converters
            apart when several are chained or when logging per format.
    """

    current: int
    total: int
    unit: str = "page"
    source: str = ""


@runtime_checkable
class ProgressCallback(Protocol):
    """Structural type for callables that receive conversion progress.

    No inheritance is required: any callable that takes a single
    :class:`ConversionProgress` argument and returns ``None`` satisfies
    this protocol (duck typing).

    Note that an ``isinstance()`` check against a runtime-checkable
    protocol only verifies that ``__call__`` exists, not its signature.

    Example::

        def my_callback(progress: ConversionProgress) -> None:
            print(f"{progress.source}: {progress.current}/{progress.total} {progress.unit}s")

        result = md.convert("doc.pdf", progress_callback=my_callback)
    """

    def __call__(self, progress: ConversionProgress) -> None:
        """Handle one progress update emitted by a converter."""
        ...


class DocumentConverterResult:
"""The result of converting a document to Markdown."""

Expand Down
7 changes: 7 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,13 @@ def convert(
- source: can be a path (str or Path), a url, or a requests.Response object
- stream_info: optional stream info to use for the conversion. If None, infer from source
- kwargs: additional arguments to pass to the converter

Keyword Args:
progress_callback: An optional callable matching
:class:`~markitdown.ProgressCallback`. When provided, the
converter will call it with a :class:`~markitdown.ConversionProgress`
object for each logical unit processed (page, slide, chapter, sheet).
Converters that do not support progress simply ignore it.
"""

# Local path or url
Expand Down
13 changes: 11 additions & 2 deletions packages/markitdown/src/markitdown/converters/_epub_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import BinaryIO, Any, Dict, List

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverterResult
from .._base_converter import DocumentConverterResult, ConversionProgress
from .._stream_info import StreamInfo

ACCEPTED_MIME_TYPE_PREFIXES = [
Expand Down Expand Up @@ -99,7 +99,16 @@ def convert(

# Extract and convert the content
markdown_content: List[str] = []
for file in spine:
progress_callback = kwargs.get("progress_callback")
total_chapters = len(spine)
for chapter_idx, file in enumerate(spine):
if progress_callback is not None:
progress_callback(ConversionProgress(
current=chapter_idx + 1,
total=total_chapters,
unit="chapter",
source="EpubConverter",
))
if file in z.namelist():
with z.open(file) as f:
filename = os.path.basename(file)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@ def accepts(

for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
# Read further to see if it's a notebook
# Read further to see if it's a notebook.
# Guard against UnicodeDecodeError: accepts() must never raise;
# it should return False for any file it cannot decode.
# This can happen with binary files (e.g. a French-language PDF whose
# UTF-8 accented characters é, è, à produce bytes starting with 0xc3)
# that happen to be reported with the application/json MIME prefix.
cur_pos = file_stream.tell()
try:
encoding = stream_info.charset or "utf-8"
Expand All @@ -38,6 +43,9 @@ def accepts(
"nbformat" in notebook_content
and "nbformat_minor" in notebook_content
)
except (UnicodeDecodeError, ValueError):
# File contains non-decodable bytes — definitely not a notebook
return False
finally:
file_stream.seek(cur_pos)

Expand Down
13 changes: 12 additions & 1 deletion packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from typing import BinaryIO, Any

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

Expand Down Expand Up @@ -536,6 +536,9 @@ def convert(

assert isinstance(file_stream, io.IOBase)

# Optional progress callback — reported per page
progress_callback = kwargs.get("progress_callback")

# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())

Expand All @@ -550,7 +553,15 @@ def convert(
plain_page_indices: list[int] = []

with pdfplumber.open(pdf_bytes) as pdf:
total_pages = len(pdf.pages)
for page_idx, page in enumerate(pdf.pages):
if progress_callback is not None:
progress_callback(ConversionProgress(
current=page_idx + 1,
total=total_pages,
unit="page",
source="PdfConverter",
))
page_content = _extract_form_content_from_words(page)

if page_content is not None:
Expand Down
11 changes: 10 additions & 1 deletion packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

Expand Down Expand Up @@ -80,10 +80,19 @@ def convert(

# Perform the conversion
presentation = pptx.Presentation(file_stream)
progress_callback = kwargs.get("progress_callback")
md_content = ""
slide_num = 0
total_slides = len(presentation.slides)
for slide in presentation.slides:
slide_num += 1
if progress_callback is not None:
progress_callback(ConversionProgress(
current=slide_num,
total=total_slides,
unit="slide",
source="PptxConverter",
))

md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

Expand Down
26 changes: 23 additions & 3 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverter, DocumentConverterResult, ConversionProgress
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo

Expand Down Expand Up @@ -81,8 +81,18 @@ def convert(
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
progress_callback = kwargs.get("progress_callback")
sheet_names = list(sheets.keys())
total_sheets = len(sheet_names)
md_content = ""
for s in sheets:
for sheet_idx, s in enumerate(sheet_names):
if progress_callback is not None:
progress_callback(ConversionProgress(
current=sheet_idx + 1,
total=total_sheets,
unit="sheet",
source="XlsxConverter",
))
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
Expand Down Expand Up @@ -143,8 +153,18 @@ def convert(
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
progress_callback = kwargs.get("progress_callback")
sheet_names = list(sheets.keys())
total_sheets = len(sheet_names)
md_content = ""
for s in sheets:
for sheet_idx, s in enumerate(sheet_names):
if progress_callback is not None:
progress_callback(ConversionProgress(
current=sheet_idx + 1,
total=total_sheets,
unit="sheet",
source="XlsConverter",
))
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
Expand Down