diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..f243a3c83 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -492,6 +492,57 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: return [table_rows] +def _extract_text_with_pymupdf(pdf_bytes: io.BytesIO) -> str | None: + """ + Extract PDF text with PyMuPDF when it is available. + + PyMuPDF is intentionally not a hard dependency of the PDF converter. This + fallback is used only when callers have installed it themselves. + """ + try: + import pymupdf + except ImportError: + try: + import fitz as pymupdf # type: ignore[no-redef] + except ImportError: + return None + + current_position = pdf_bytes.tell() + try: + pdf_bytes.seek(0) + with pymupdf.open(stream=pdf_bytes.read(), filetype="pdf") as doc: + return "\n".join(page.get_text("text") for page in doc) + except Exception: + return None + finally: + pdf_bytes.seek(current_position) + + +def _prefer_pymupdf_text_if_substantially_better( + markdown: str, + pdf_bytes: io.BytesIO, +) -> str: + """ + Prefer PyMuPDF output when pdfminer/pdfplumber likely truncated a page. + + Some inline-image streams can confuse pdfminer-family tokenizers so text + after the image is silently dropped. PyMuPDF is used only when it returns + substantially more text, which avoids replacing table/form formatting for + ordinary PDFs where both extractors saw the same content. + """ + pymupdf_text = _extract_text_with_pymupdf(pdf_bytes) + if not pymupdf_text: + return markdown + + existing_len = len(markdown.strip()) + pymupdf_len = len(pymupdf_text.strip()) + + if pymupdf_len > existing_len * 1.3 + 500: + return pymupdf_text + + return markdown + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. 
def _install_fake_pymupdf(monkeypatch, text: str):
    """Register a minimal stand-in for PyMuPDF under ``sys.modules["pymupdf"]``.

    The stand-in exposes ``open(stream, filetype)`` returning a one-page
    document whose page yields ``text`` from ``get_text("text")``.
    """

    class StubPage:
        def get_text(self, mode):
            assert mode == "text"
            return text

    class StubDocument(list):
        # The code under test opens the document in a ``with`` statement.
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            return False

    class StubPyMuPDF:
        @staticmethod
        def open(stream, filetype):
            assert stream == b"fake pdf content"
            assert filetype == "pdf"
            single_page = [StubPage()]
            return StubDocument(single_page)

    monkeypatch.setitem(sys.modules, "pymupdf", StubPyMuPDF)


def test_pymupdf_fallback_replaces_suspiciously_short_text(monkeypatch):
    """The PyMuPDF text is returned when it is far longer than the existing text."""
    start_offset = 4
    long_text = "AFTER_IMAGE recovered text " * 80
    _install_fake_pymupdf(monkeypatch, long_text)

    stream = io.BytesIO(b"fake pdf content")
    stream.seek(start_offset)
    outcome = _prefer_pymupdf_text_if_substantially_better("BEFORE_IMAGE", stream)

    assert outcome == long_text
    # The helper must hand the stream back at the offset it received it.
    assert stream.tell() == start_offset


def test_pymupdf_fallback_keeps_existing_text_when_not_substantially_better(
    monkeypatch,
):
    """The existing text is kept when PyMuPDF's text is only slightly longer."""
    existing_text = "same content"
    _install_fake_pymupdf(monkeypatch, "same content with minor formatting changes")

    stream = io.BytesIO(b"fake pdf content")
    outcome = _prefer_pymupdf_text_if_substantially_better(existing_text, stream)

    assert outcome == existing_text