diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..614022660 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
             self.register_converter(DocxConverter())
+            self.register_converter(DocConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..56475bdb6 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
@@ -34,6 +35,7 @@
     "BingSerpConverter",
     "PdfConverter",
     "DocxConverter",
+    "DocConverter",
     "XlsxConverter",
     "XlsConverter",
     "PptxConverter",
diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
new file mode 100644
index 000000000..cb4a71c0c
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,91 @@
+import sys
+from typing import Any, BinaryIO
+
+from .._stream_info import StreamInfo
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._exceptions import (
+    MissingDependencyException,
+    MISSING_DEPENDENCY_MESSAGE,
+    FileConversionException,
+)
+
+# `unword` is an optional dependency. Remember a failed import instead of
+# raising here, so the package still imports and convert() can report it.
+_dependency_exc_info = None
+unword = None
+try:
+    import unword as _unword
+
+    unword = _unword
+except ImportError:
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/msword",
+    "application/x-msword",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+
+
+class DocConverter(DocumentConverter):
+    """
+    Converts DOC (Word 97-2003) files to Markdown. Uses unword package
+    as parser backend to extract body text with heading levels,
+    page breaks, and textbox contents.
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True for a `.doc` extension or an MS Word MIME type."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Parse `file_stream` as a .doc file and return its text as Markdown.
+
+        Raises MissingDependencyException when `unword` is not installed, and
+        FileConversionException when `unword` fails to parse the stream.
+        """
+        # Check: the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".doc",
+                    feature="doc",
+                )
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+
+        try:
+            doc = unword.parse_doc(file_stream.read())
+        except Exception as e:
+            raise FileConversionException(f"Failed to parse .doc file: {e}") from e
+
+        # `title` / `metadata` are probed defensively; a missing or None
+        # `metadata` must not turn a successful parse into an AttributeError.
+        metadata = getattr(doc, "metadata", None) or {}
+        title = getattr(doc, "title", None) or metadata.get("title")
+        body_text = doc.body_text or ""
+        return DocumentConverterResult(markdown=body_text.strip(), title=title)
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index 360f17706..22abadd39 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -4,6 +4,7 @@
 import io
 import re
 import html
+import hashlib
 
 from typing import BinaryIO, Any
 from operator import attrgetter
@@ -147,9 +148,37 @@ def get_shape_content(shape, **kwargs):
                         b64_string = base64.b64encode(blob).decode("utf-8")
                         md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
                     else:
-                        # A placeholder name
-                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                        # Name the image after its shape plus a short content
+                        # hash, so two shapes that share a name but hold
+                        # different images get distinct filenames.
+                        blob = shape.image.blob
+                        content_type = shape.image.content_type or "image/png"
+                        ext_map = {"jpeg": "jpg", "svg+xml": "svg"}
+                        raw_ext = content_type.split("/")[-1]
+                        ext = ext_map.get(raw_ext, raw_ext)
+
+                        # SHA-256 rather than MD5: this is only a collision-avoiding
+                        # tag, but MD5 can be blocked on FIPS-restricted builds.
+                        suffix = hashlib.sha256(blob).hexdigest()[:8]
+                        safe_name = re.sub(r"\W", "", shape.name or "") or "image"
+                        filename = f"{safe_name}_{suffix}.{ext}"
+
+                        # Write the image only when the caller opts in with
+                        # `output_dir`; converting must not drop files into the
+                        # current working directory as a side effect.
+                        output_dir = kwargs.get("output_dir")
+                        if output_dir:
+                            image_path = os.path.join(output_dir, filename)
+                            try:
+                                os.makedirs(output_dir, exist_ok=True)
+                                with open(image_path, "wb") as img_file:
+                                    img_file.write(blob)
+                            except OSError as e:
+                                raise OSError(
+                                    f"Failed to write image to '{image_path}'."
+                                ) from e
+
                         md_content += "\n![" + alt_text + "](" + filename + ")\n"
 
                 # Tables
                 if self._is_table(shape):
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..82b824cbb 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -31,6 +31,22 @@ class FileTestVector(object):
             "data:image/png;base64,iVBORw0KGgoAAAANSU",
         ],
     ),
+    FileTestVector(
+        filename="test.doc",
+        mimetype="application/msword",
+        charset=None,
+        url=None,
+        must_include=[
+            "93d437af-bc31-492f-a7fc-3cbc9b7c1710",
+            "fd0ed3e3-6373-4446-815c-4b979f6063a9",
+            "#Test for converting .doc files to MD format",
+            "Let’s test it!",
+        ],
+        must_not_include=[
+            "d0cf11e0",
+            "\x00",
+        ],
+    ),
     FileTestVector(
         filename="test.xlsx",
         mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -68,7 +84,7 @@ class FileTestVector(object):
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
-            "![This phrase of the caption is Human-written.](Picture4.jpg)",
+            "![This phrase of the caption is Human-written.](Picture4_",
         ],
         must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
     ),
diff --git a/packages/markitdown/tests/test_files/test.doc b/packages/markitdown/tests/test_files/test.doc
new file mode 100644
index 000000000..cbba93947
Binary files /dev/null and b/packages/markitdown/tests/test_files/test.doc differ