microsoft · MukundaKatta · May 15, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -492,6 +492,57 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
     return [table_rows]
 
 
+def _extract_text_with_pymupdf(pdf_bytes: io.BytesIO) -> str | None:
+    """
+    Return the PDF's text as extracted by PyMuPDF, or None if unavailable.
+
+    Optional dependency: tried as ``pymupdf`` first, then as ``fitz``.
+    """
+    try:
+        import pymupdf as mupdf
+    except ImportError:
+        try:
+            import fitz as mupdf  # type: ignore[no-redef]
+        except ImportError:
+            return None
+
+    saved_offset = pdf_bytes.tell()
+    try:
+        pdf_bytes.seek(0)
+        with mupdf.open(stream=pdf_bytes.read(), filetype="pdf") as document:
+            page_texts = [page.get_text("text") for page in document]
+        return "\n".join(page_texts)
+    except Exception:  # best-effort fallback: any failure means "no text"
+        return None
+    finally:
+        pdf_bytes.seek(saved_offset)  # leave the caller's stream position as-is
+
+
+def _prefer_pymupdf_text_if_substantially_better(
+    markdown: str,
+    pdf_bytes: io.BytesIO,
+) -> str:
+    """
+    Return PyMuPDF's text instead of ``markdown`` when it is much longer.
+
+    pdfminer-family tokenizers can be confused by some inline-image streams
+    and silently drop the text that follows. A much longer PyMuPDF result is
+    taken as a sign of that truncation; otherwise ``markdown`` is returned
+    unchanged so ordinary PDFs keep their table/form formatting.
+    """
+    fallback_text = _extract_text_with_pymupdf(pdf_bytes)
+    if not fallback_text:
+        return markdown
+
+    # The fallback wins only when its stripped length exceeds 130% of the
+    # existing stripped length plus a 500-character margin.
+    baseline_chars = len(markdown.strip())
+    fallback_chars = len(fallback_text.strip())
+    required_chars = baseline_chars * 1.3 + 500
+
+    return fallback_text if fallback_chars > required_chars else markdown
+
+
 class PdfConverter(DocumentConverter):
     """
     Converts PDFs to Markdown.
@@ -583,6 +634,8 @@ def convert(
             pdf_bytes.seek(0)
             markdown = pdfminer.high_level.extract_text(pdf_bytes)
 
+        markdown = _prefer_pymupdf_text_if_substantially_better(markdown, pdf_bytes)
+
         # Post-process to merge MasterFormat-style partial numbering with following text
         markdown = _merge_partial_numbering_lines(markdown)
 

diff --git a/packages/markitdown/tests/test_pdf_memory.py b/packages/markitdown/tests/test_pdf_memory.py
@@ -11,12 +11,16 @@
 import gc
 import io
 import os
+import sys
 import tracemalloc
 
 import pytest
 from unittest.mock import patch, MagicMock
 
 from markitdown import MarkItDown
+from markitdown.converters._pdf_converter import (
+    _prefer_pymupdf_text_if_substantially_better,
+)
 
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 
@@ -80,6 +84,58 @@ def mock_open(stream):
     return mock_open
 
 
+def _install_fake_pymupdf(monkeypatch, text: str):
+    """Register a stub ``pymupdf`` module whose single page yields ``text``."""
+
+    class StubPage:
+        def get_text(self, mode):
+            assert mode == "text"
+            return text
+
+    class StubDocument(list):
+        # A list of pages that also works as a context manager.
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, traceback):
+            return False
+
+    class StubPyMuPDF:
+        @staticmethod
+        def open(stream, filetype):
+            assert (stream, filetype) == (b"fake pdf content", "pdf")
+            return StubDocument([StubPage()])
+
+    monkeypatch.setitem(sys.modules, "pymupdf", StubPyMuPDF)
+
+
+def test_pymupdf_fallback_replaces_suspiciously_short_text(monkeypatch):
+    """A much longer PyMuPDF result should replace the truncated text."""
+    long_text = "AFTER_IMAGE recovered text " * 80
+    _install_fake_pymupdf(monkeypatch, long_text)
+
+    stream = io.BytesIO(b"fake pdf content")
+    stream.seek(4)  # non-zero offset: the helper must restore it
+    chosen = _prefer_pymupdf_text_if_substantially_better("BEFORE_IMAGE", stream)
+
+    assert chosen == long_text
+    assert stream.tell() == 4
+
+
+def test_pymupdf_fallback_keeps_existing_text_when_not_substantially_better(
+    monkeypatch,
+):
+    """Comparable PyMuPDF output must not displace the existing text."""
+    existing_text = "same content"
+    _install_fake_pymupdf(monkeypatch, "same content with minor formatting changes")
+
+    chosen = _prefer_pymupdf_text_if_substantially_better(
+        existing_text, io.BytesIO(b"fake pdf content")
+    )
+
+    assert chosen == existing_text
+
+
 class TestPdfMemoryOptimization:
     """Test that PDF conversion cleans up per-page caches to limit memory."""