Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,57 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
return [table_rows]


def _extract_text_with_pymupdf(pdf_bytes: io.BytesIO) -> str | None:
"""
Extract PDF text with PyMuPDF when it is available.

PyMuPDF is intentionally not a hard dependency of the PDF converter. This
fallback is used only when callers have installed it themselves.
"""
try:
import pymupdf
except ImportError:
try:
import fitz as pymupdf # type: ignore[no-redef]
except ImportError:
return None

current_position = pdf_bytes.tell()
try:
pdf_bytes.seek(0)
with pymupdf.open(stream=pdf_bytes.read(), filetype="pdf") as doc:
return "\n".join(page.get_text("text") for page in doc)
except Exception:
return None
finally:
pdf_bytes.seek(current_position)


def _prefer_pymupdf_text_if_substantially_better(
    markdown: str,
    pdf_bytes: io.BytesIO,
) -> str:
    """
    Swap in PyMuPDF's text when the existing extraction looks truncated.

    Some inline-image streams can confuse pdfminer-family tokenizers so that
    text after the image is silently dropped. The PyMuPDF result replaces the
    existing output only when it is substantially longer; otherwise the
    existing output is kept, which preserves table/form formatting for
    ordinary PDFs where both extractors saw the same content.

    Args:
        markdown: Text already produced by the primary extraction path.
        pdf_bytes: Stream holding the PDF, handed to the PyMuPDF helper.

    Returns:
        The PyMuPDF text when its stripped length exceeds 1.3 times the
        stripped length of ``markdown`` plus 500 characters; otherwise
        ``markdown`` unchanged.
    """
    candidate = _extract_text_with_pymupdf(pdf_bytes)
    if candidate:
        baseline_size = len(markdown.strip())
        candidate_size = len(candidate.strip())

        # Require both a relative (30%) and an absolute (500 chars) margin so
        # that small formatting differences never trigger a replacement.
        if candidate_size > baseline_size * 1.3 + 500:
            return candidate

    # PyMuPDF unavailable, failed, returned nothing, or was not clearly better.
    return markdown


class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown.
Expand Down Expand Up @@ -583,6 +634,8 @@ def convert(
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)

markdown = _prefer_pymupdf_text_if_substantially_better(markdown, pdf_bytes)

# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)

Expand Down
56 changes: 56 additions & 0 deletions packages/markitdown/tests/test_pdf_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,16 @@
import gc
import io
import os
import sys
import tracemalloc

import pytest
from unittest.mock import patch, MagicMock

from markitdown import MarkItDown
from markitdown.converters._pdf_converter import (
_prefer_pymupdf_text_if_substantially_better,
)

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

Expand Down Expand Up @@ -80,6 +84,58 @@ def mock_open(stream):
return mock_open


def _install_fake_pymupdf(monkeypatch, text: str):
    """
    Register a minimal stand-in for PyMuPDF under ``sys.modules["pymupdf"]``.

    The stand-in exposes ``open(stream, filetype)``, which asserts it was
    given the bytes ``b"fake pdf content"`` and the file type ``"pdf"``, and
    returns a one-page document usable as a context manager. The page's
    ``get_text("text")`` returns ``text``.

    Args:
        monkeypatch: Object providing ``setitem(mapping, key, value)``; the
            tests pass pytest's ``monkeypatch`` fixture.
        text: Text the single fake page should report.
    """

    class _StubDocument(list):
        # A list of pages that can also be used in a ``with`` statement.
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            # Returning False lets any exception propagate.
            return False

    class _StubPage:
        def get_text(self, mode):
            assert mode == "text"
            return text

    class _StubModule:
        @staticmethod
        def open(stream, filetype):
            assert stream == b"fake pdf content"
            assert filetype == "pdf"
            document = _StubDocument()
            document.append(_StubPage())
            return document

    monkeypatch.setitem(sys.modules, "pymupdf", _StubModule)


def test_pymupdf_fallback_replaces_suspiciously_short_text(monkeypatch):
    """The PyMuPDF text is chosen when it is far longer than the existing text."""
    start_offset = 4
    long_text = "AFTER_IMAGE recovered text " * 80
    _install_fake_pymupdf(monkeypatch, long_text)

    stream = io.BytesIO(b"fake pdf content")
    stream.seek(start_offset)

    chosen = _prefer_pymupdf_text_if_substantially_better("BEFORE_IMAGE", stream)

    assert chosen == long_text
    # The helper must hand the stream back at the offset it started from.
    assert stream.tell() == start_offset


def test_pymupdf_fallback_keeps_existing_text_when_not_substantially_better(
    monkeypatch,
):
    """The existing text is kept when PyMuPDF's output is only slightly longer."""
    original_text = "same content"
    _install_fake_pymupdf(monkeypatch, "same content with minor formatting changes")

    stream = io.BytesIO(b"fake pdf content")
    chosen = _prefer_pymupdf_text_if_substantially_better(original_text, stream)

    assert chosen == original_text


class TestPdfMemoryOptimization:
"""Test that PDF conversion cleans up per-page caches to limit memory."""

Expand Down