diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..a207cd35d 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -39,6 +39,7 @@ EpubConverter, DocumentIntelligenceConverter, CsvConverter, + WebVttConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -202,6 +203,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) self.register_converter(CsvConverter()) + self.register_converter(WebVttConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..f5535562d 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -23,6 +23,7 @@ ) from ._epub_converter import EpubConverter from ._csv_converter import CsvConverter +from ._webvtt_converter import WebVttConverter __all__ = [ "PlainTextConverter", @@ -45,4 +46,5 @@ "DocumentIntelligenceFileType", "EpubConverter", "CsvConverter", + "WebVttConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_webvtt_converter.py b/packages/markitdown/src/markitdown/converters/_webvtt_converter.py new file mode 100644 index 000000000..e98b3009a --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_webvtt_converter.py @@ -0,0 +1,80 @@ +import html +import re +from typing import Any, BinaryIO + +from charset_normalizer import from_bytes + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + +ACCEPTED_MIME_TYPES = ["text/vtt"] +ACCEPTED_FILE_EXTENSIONS = [".vtt"] + 
# Cue timing line, e.g. "00:01.000 --> 00:03.000" or "01:00:01.000 --> 01:00:03.000".
# The hours field is optional and, when present, has two or more digits.
# Anything after the end timestamp (cue settings such as "position:10%") is ignored.
_TIMESTAMP_RE = re.compile(
    r"^\s*(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s+-->\s+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}"
)
# Opening voice tag, e.g. "<v Alice>" or "<v.loud Alice>"; group 1 is the speaker name.
_VOICE_OPEN_RE = re.compile(r"<v(?:\.[^>\s]+)?\s+([^>]+)>")
# Any other inline cue markup (<b>, </v>, <c.classname>, <00:00:01.000>, ...).
_TAG_RE = re.compile(r"<[^>]+>")
# First line of a non-cue block. The keyword must be followed by whitespace or
# end of line so that cue identifiers such as "NOTES-1" are not mistaken for it.
_NON_CUE_BLOCK_RE = re.compile(r"(?:WEBVTT|NOTE|STYLE|REGION)(?:\s|$)")


class WebVttConverter(DocumentConverter):
    """Convert WebVTT subtitle files to readable Markdown text.

    Only cue payload text is kept. The file header, NOTE/STYLE/REGION blocks,
    cue identifiers, timing lines and inline markup are discarded; voice tags
    are rewritten as a "Speaker: " prefix.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like WebVTT by mime type or extension.

        The mime type is matched by prefix so that values carrying parameters
        (for example "text/vtt; charset=utf-8") are accepted as well.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Read the whole stream and return the cue text as Markdown.

        Cues are separated by a blank line in the output; lines inside a cue
        keep their original line breaks.

        Raises:
            UnicodeDecodeError: if ``stream_info.charset`` is set but the
                content cannot be decoded with it.
        """
        raw = file_stream.read()
        if stream_info.charset:
            text = raw.decode(stream_info.charset)
        else:
            best = from_bytes(raw).best()
            # best() returns None when no encoding fits; str(None) would turn
            # the document into the literal text "None", so fall back to UTF-8.
            text = (
                str(best)
                if best is not None
                else raw.decode("utf-8", errors="replace")
            )

        # WebVTT permits CRLF, LF and bare CR line terminators; normalise them
        # so that blank-line block splitting works for all three.
        text = text.replace("\r\n", "\n").replace("\r", "\n")

        blocks = re.split(r"\n\s*\n", text)
        cues = [cue for cue in map(self._convert_block, blocks) if cue]

        return DocumentConverterResult(markdown="\n\n".join(cues))

    def _convert_block(self, block: str) -> str:
        """Return the cleaned payload of one blank-line-delimited block.

        Returns an empty string for blocks that are not cues: empty blocks,
        the WEBVTT header, NOTE/STYLE/REGION blocks, and any block without a
        timing line.
        """
        lines = [line.strip() for line in block.splitlines() if line.strip()]
        if not lines:
            return ""

        # A byte-order mark may survive decoding on the very first line.
        first = lines[0].lstrip("\ufeff")
        if _NON_CUE_BLOCK_RE.match(first):
            return ""

        # Everything before the timing line is the optional cue identifier.
        timestamp_index = next(
            (i for i, line in enumerate(lines) if _TIMESTAMP_RE.match(line)),
            None,
        )
        if timestamp_index is None:
            return ""

        cleaned = (
            self._clean_text_line(line) for line in lines[timestamp_index + 1 :]
        )
        # Lines that consisted only of markup become empty and are dropped.
        return "\n".join(line for line in cleaned if line)

    def _clean_text_line(self, line: str) -> str:
        """Strip cue markup from one payload line and normalise whitespace.

        Voice tags become a "Speaker: " prefix, all other tags are removed,
        HTML character references are decoded, and runs of whitespace are
        collapsed to single spaces.
        """
        line = _VOICE_OPEN_RE.sub(lambda m: f"{m.group(1).strip()}: ", line)
        line = _TAG_RE.sub("", line)
        # Unescape only after tag removal so "&lt;b&gt;" stays literal text.
        line = html.unescape(line)
        return re.sub(r"\s+", " ", line).strip()


# Residual patch text that followed the class on this line (header of the next
# file in the diff), kept verbatim:
# diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..5234b8c9a 100644 ---
# Residual patch text that preceded the test on this line (rest of the file
# header, hunk marker and context from the previous test), kept verbatim:
# a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -432,6 +432,37 @@ def test_exceptions() -> None: assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"


def test_webvtt_converter_outputs_clean_transcript(tmp_path) -> None:
    """A .vtt file converts to cue text only.

    The fixture exercises: the WEBVTT header with metadata, a NOTE block, a
    numeric cue identifier, cue settings after the timing line, a voice tag,
    an HTML entity, a multi-line cue, a STYLE block and a cue without an
    identifier.
    """
    vtt_file = tmp_path / "meeting.vtt"
    # The voice tag and the "&amp;" entity are required for the expected
    # "Alice: Hello & welcome" line below to be producible from this input.
    vtt_file.write_text(
        """WEBVTT
Kind: captions

NOTE this should be ignored

1
00:00:01.000 --> 00:00:03.000 position:10%
<v Alice>Hello &amp; welcome
to the meeting.

STYLE
::cue { color: lime; }

00:00:04.000 --> 00:00:06.000
Next action item
""",
        encoding="utf-8",
    )

    result = MarkItDown().convert(str(vtt_file))

    assert result.text_content == (
        "Alice: Hello & welcome\n"
        "to the meeting.\n\n"
        "Next action item"
    )


# Residual patch text that followed the test on this line (start of the next
# decorated test, cut off at the end of the visible chunk), kept verbatim:
# @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed",