from email import policy
from email.message import EmailMessage, Message
from email.parser import BytesParser
from typing import Any, BinaryIO, Iterable

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._html_converter import HtmlConverter

ACCEPTED_MIME_TYPES = [
    "message/rfc822",
    "application/eml",
]

ACCEPTED_FILE_EXTENSIONS = [".eml"]


class EmlConverter(DocumentConverter):
    """Converts RFC 822 / EML email files to markdown.

    The output contains the From/To/Cc/Date/Subject headers, the message
    body (plain text preferred, HTML converted to markdown otherwise) and a
    list of attachment names. Attachment content is never inlined.
    """

    # Number of leading bytes inspected when sniffing a stream in accepts().
    _SNIFF_BYTES = 65536

    # Leaf content types that may contribute to the rendered body.
    _BODY_CONTENT_TYPES = ("text/plain", "text/html")

    def __init__(self) -> None:
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like an email message.

        A known file extension or MIME type is accepted immediately.
        Otherwise the first ``_SNIFF_BYTES`` bytes are parsed as headers and
        the stream is accepted when it has a ``From`` header plus a ``To``
        or ``Subject`` header. The stream position is restored afterwards.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS or mimetype in ACCEPTED_MIME_TYPES:
            return True

        cur_pos = file_stream.tell()
        try:
            # Only headers matter for sniffing; headersonly avoids running
            # the MIME body parser over a prefix that is probably truncated.
            message = BytesParser(policy=policy.default).parsebytes(
                file_stream.read(self._SNIFF_BYTES), headersonly=True
            )
            return message.get("from") is not None and (
                message.get("to") is not None or message.get("subject") is not None
            )
        finally:
            file_stream.seek(cur_pos)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Parse the email in ``file_stream`` and render it as markdown.

        Returns a result whose markdown holds the header lines, a
        ``## Content`` section and, when attachments exist, an
        ``## Attachments`` section. The title is the Subject header
        (``None`` when the message has no subject).
        """
        message = BytesParser(policy=policy.default).parse(file_stream)
        body = self._body_to_markdown(message)
        attachments = self._attachments(message)

        pieces: list[str] = ["# Email Message\n\n"]
        pieces.extend(
            f"**{key}:** {value}\n" for key, value in self._headers(message) if value
        )

        pieces.append("\n## Content\n\n")
        if body:
            pieces.append(body.strip())

        if attachments:
            pieces.append("\n\n## Attachments\n\n")
            pieces.extend(f"- {attachment}\n" for attachment in attachments)

        return DocumentConverterResult(
            markdown="".join(pieces).strip(),
            title=message.get("subject"),
        )

    def _headers(self, message: Message) -> list[tuple[str, str | None]]:
        """Return (label, value) pairs for the displayed headers, in order.

        A value is ``None`` when the header is absent from the message.
        """
        labels = ("From", "To", "Cc", "Date", "Subject")
        return [(label, message.get(label.lower())) for label in labels]

    def _body_to_markdown(self, message: Message) -> str:
        """Render the message body as markdown.

        Non-blank ``text/plain`` parts win; HTML parts are converted to
        markdown only when no plain text with actual content exists, so an
        empty plain-text alternative does not hide a real HTML body.
        Returns an empty string when there is no usable body.
        """
        plain_parts: list[str] = []
        html_sources: list[str] = []

        for part in self._iter_body_parts(message):
            content = self._text_content(part)
            if content is None:
                continue

            content_type = part.get_content_type()
            if content_type == "text/plain":
                stripped = content.strip()
                if stripped:
                    plain_parts.append(stripped)
            elif content_type == "text/html":
                html_sources.append(content)

        if plain_parts:
            return "\n\n".join(plain_parts)

        # HTML is converted lazily: the work is skipped entirely whenever
        # the plain-text body is used.
        html_parts = (
            self._html_converter.convert_string(source).markdown.strip()
            for source in html_sources
        )
        return "\n\n".join(part for part in html_parts if part)

    @staticmethod
    def _text_content(part: Message) -> str | None:
        """Return the decoded text of ``part``, or ``None`` if it is not text.

        An unknown charset label makes ``get_content()`` raise
        ``LookupError``; in that case the transfer-decoded payload is
        decoded as UTF-8 with replacement characters instead of aborting
        the whole conversion.
        """
        try:
            content = part.get_content()  # type: ignore[attr-defined]
        except LookupError:
            raw = part.get_payload(decode=True)
            if not isinstance(raw, bytes):
                return None
            return raw.decode("utf-8", errors="replace")
        return content if isinstance(content, str) else None

    def _iter_body_parts(self, message: Message) -> Iterable[EmailMessage]:
        """Yield the text/plain and text/html leaf parts that form the body.

        Parts are yielded depth-first in document order. Any part marked as
        an attachment is skipped together with everything nested inside it,
        so the text of an attached email (``message/rfc822``) is not
        mistaken for body text. A non-multipart message is yielded itself
        when it is plain text or HTML.
        """
        if not message.is_multipart():
            if message.get_content_type() in self._BODY_CONTENT_TYPES:
                yield message  # type: ignore[misc]
            return

        for child in message.get_payload():
            yield from self._iter_inline_text_parts(child)

    def _iter_inline_text_parts(self, part: Message) -> Iterable[EmailMessage]:
        """Recursively yield text leaves under ``part``, pruning attachments."""
        # The disposition check comes first: a message/rfc822 attachment
        # reports is_multipart() == True, so checking containers first would
        # descend into the attached message.
        if part.get_content_disposition() == "attachment":
            return

        if part.is_multipart():
            for child in part.get_payload():
                yield from self._iter_inline_text_parts(child)
        elif part.get_content_type() in self._BODY_CONTENT_TYPES:
            yield part  # type: ignore[misc]

    def _attachments(self, message: Message) -> list[str]:
        """Return ``"<filename> (<content type>)"`` for each attachment part.

        Parts without a filename are reported as ``unnamed attachment``.
        A non-multipart message yields an empty list.
        """
        if not message.is_multipart():
            return []

        return [
            f"{part.get_filename() or 'unnamed attachment'} ({part.get_content_type()})"
            for part in message.walk()
            if part.get_content_disposition() == "attachment"
        ]
"**To:** Test Recipient ", + "**Cc:** Copy Recipient ", + "**Date:** Thu, 14 May 2026 09:50:00 +0800", + "**Subject:** Test EML Message", + "## Content", + "This is the plain text body of the EML test message.", + "It should be preferred over the HTML alternative.", + "## Attachments", + "- notes.txt (text/plain)", + ], + must_not_include=[ + "This HTML body should not be used.", + "attachment content should not be inlined", + ], + ), FileTestVector( filename="test.pdf", mimetype="application/pdf", diff --git a/packages/markitdown/tests/test_files/test_email.eml b/packages/markitdown/tests/test_files/test_email.eml new file mode 100644 index 000000000..9d8f0b274 --- /dev/null +++ b/packages/markitdown/tests/test_files/test_email.eml @@ -0,0 +1,27 @@ +From: Test Sender +To: Test Recipient +Cc: Copy Recipient +Date: Thu, 14 May 2026 09:50:00 +0800 +Subject: Test EML Message +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="markitdown-test-boundary" + +--markitdown-test-boundary +Content-Type: text/plain; charset="utf-8" + +This is the plain text body of the EML test message. + +It should be preferred over the HTML alternative. + +--markitdown-test-boundary +Content-Type: text/html; charset="utf-8" + +

This HTML body should not be used.

+ +--markitdown-test-boundary +Content-Type: text/plain; name="notes.txt" +Content-Disposition: attachment; filename="notes.txt" + +attachment content should not be inlined + +--markitdown-test-boundary--