Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BingSerpConverter,
PdfConverter,
DocxConverter,
DocConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
Expand Down Expand Up @@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(DocConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._doc_converter import DocConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
Expand All @@ -34,6 +35,7 @@
"BingSerpConverter",
"PdfConverter",
"DocxConverter",
"DocConverter",
"XlsxConverter",
"XlsConverter",
"PptxConverter",
Expand Down
73 changes: 73 additions & 0 deletions packages/markitdown/src/markitdown/converters/_doc_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import sys
from typing import Any, BinaryIO

from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE, FileConversionException

# Optional dependency: unword. The import is attempted once at module load;
# if it fails, the exception info is kept so that convert() can report the
# missing dependency at call time instead of failing when this module loads.
_dependency_exc_info = None
unword = None
try:
    import unword as _unword
except ImportError:
    _dependency_exc_info = sys.exc_info()
else:
    unword = _unword

# MIME type prefixes that DocConverter.accepts() recognizes.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/msword",
    "application/x-msword",
]

# File extensions that DocConverter.accepts() recognizes.
ACCEPTED_FILE_EXTENSIONS = [".doc"]


class DocConverter(DocumentConverter):
    """
    Converts DOC (Word 97-2003) files to Markdown. Uses the unword package
    as the parser backend to extract body text with heading levels,
    page breaks, and textbox contents.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """
        Return True when the stream looks like a legacy .doc file.

        The decision is based only on stream_info: the file extension is
        checked first, then the MIME type prefix. Both are compared
        case-insensitively, and a missing value is treated as an empty string.
        The file_stream itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so every prefix is tested in one call.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Parse the .doc stream with unword and return its body text as Markdown.

        Returns:
            A DocumentConverterResult whose markdown is the parsed body text
            with surrounding whitespace stripped, and whose title is taken
            from the parsed document's ``title`` attribute or, failing that,
            from the ``"title"`` entry of its ``metadata``.

        Raises:
            MissingDependencyException: if the unword package could not be
                imported when this module was loaded.
            FileConversionException: if unword fails to parse the stream.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".doc",
                    feature="doc",
                )
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # The whole stream is read into memory and handed to the parser; any
        # parser failure is re-raised as a conversion error with its cause.
        try:
            doc = unword.parse_doc(file_stream.read())
        except Exception as e:
            raise FileConversionException(f"Failed to parse .doc file: {e}") from e

        # Prefer an explicit title attribute; otherwise fall back to the
        # metadata mapping. The metadata lookup is guarded so that a missing,
        # None, or non-mapping ``metadata`` does not raise AttributeError.
        title = getattr(doc, "title", None)
        if not title:
            metadata_get = getattr(getattr(doc, "metadata", None), "get", None)
            title = metadata_get("title") if callable(metadata_get) else None

        # Guard in case the parser reports an empty body as None rather than "".
        body_text = doc.body_text or ""
        return DocumentConverterResult(markdown=body_text.strip(), title=title)
27 changes: 25 additions & 2 deletions packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io
import re
import html
import hashlib

from typing import BinaryIO, Any
from operator import attrgetter
Expand Down Expand Up @@ -140,16 +141,38 @@ def get_shape_content(shape, **kwargs):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()

output_dir = kwargs.get("output_dir")

# If keep_data_uris is True, use base64 encoding for images
if kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
#save image to disk to reference
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
ext_map = {"jpeg": "jpg", "svg+xml": "svg"}
raw_ext = content_type.split("/")[-1]
ext = ext_map.get(raw_ext, raw_ext)

#add filename collision handling
suffix = hashlib.md5(blob).hexdigest()[:8]
safe_name = re.sub(r"\W", "", shape.name) if shape.name else "image"
filename = f"{safe_name}_{suffix}.{ext}"

output_dir = kwargs.get("output_dir", ".")
image_path = os.path.join(output_dir, filename)

try:
with open(image_path, "wb") as img_file:
img_file.write(blob)
except OSError as e:
raise OSError(f"Failed to write image to '{image_path}'.") from e

md_content += "\n![" + alt_text + "](" + filename + ")\n"


# Tables
if self._is_table(shape):
Expand Down
18 changes: 17 additions & 1 deletion packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,22 @@ class FileTestVector(object):
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
),
FileTestVector(
filename="test.doc",
mimetype="application/msword",
charset=None,
url=None,
must_include=[
"93d437af-bc31-492f-a7fc-3cbc9b7c1710",
"fd0ed3e3-6373-4446-815c-4b979f6063a9",
"#Test for converting .doc files to MD format",
"Let’s test it!",
],
must_not_include=[
"d0cf11e0",
"\x00"
],
),
FileTestVector(
filename="test.xlsx",
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
Expand Down Expand Up @@ -68,7 +84,7 @@ class FileTestVector(object):
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.](Picture4.jpg)",
"![This phrase of the caption is Human-written.](Picture4_",
],
must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
),
Expand Down
Binary file added packages/markitdown/tests/test_files/test.doc
Binary file not shown.