Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
EpubConverter,
DocumentIntelligenceConverter,
CsvConverter,
WebVttConverter,
)

from ._base_converter import DocumentConverter, DocumentConverterResult
Expand Down Expand Up @@ -202,6 +203,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
self.register_converter(WebVttConverter())

# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from ._epub_converter import EpubConverter
from ._csv_converter import CsvConverter
from ._webvtt_converter import WebVttConverter

__all__ = [
"PlainTextConverter",
Expand All @@ -45,4 +46,5 @@
"DocumentIntelligenceFileType",
"EpubConverter",
"CsvConverter",
"WebVttConverter",
]
80 changes: 80 additions & 0 deletions packages/markitdown/src/markitdown/converters/_webvtt_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import html
import re
from typing import Any, BinaryIO

from charset_normalizer import from_bytes

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo

# MIME types and file extensions that identify a WebVTT document.
ACCEPTED_MIME_TYPES = ["text/vtt"]
ACCEPTED_FILE_EXTENSIONS = [".vtt"]

# Cue timing line: "[hh:]mm:ss.ttt --> [hh:]mm:ss.ttt", optionally followed by
# cue settings. The hours field is optional and may have more than two digits
# (WebVTT allows "two or more"), so long recordings such as "100:00:01.000"
# are still recognised as cues.
_TIMESTAMP_RE = re.compile(
    r"^\s*(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s+-->\s+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}"
)
# Opening voice tag, e.g. "<v Alice>" or "<v.loud Alice>"; group 1 is the
# speaker annotation.
_VOICE_OPEN_RE = re.compile(r"<v(?:\.[^>\s]+)?\s+([^>]+)>")
# Any remaining inline tag (<i>, </v>, <c.class>, <00:00:01.000>, ...).
_TAG_RE = re.compile(r"<[^>]+>")


class WebVttConverter(DocumentConverter):
    """Convert WebVTT subtitle files to readable Markdown text.

    Only cue payload text is kept. Cue identifiers, timing lines, cue
    settings and inline markup are dropped; voice tags (``<v Name>``) become
    a ``Name: `` prefix. The ``WEBVTT`` header block and ``NOTE``, ``STYLE``
    and ``REGION`` blocks produce no output.
    """

    # First line of a block that is not a cue. The keyword must be a whole
    # token so that cue identifiers such as "NOTES-1" or "REGIONAL" are not
    # mistaken for comment/style/region blocks.
    _NON_CUE_BLOCK_RE = re.compile(r"(?:WEBVTT|NOTE|STYLE|REGION)(?:\s|$)")

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream's MIME type or extension is WebVTT.

        The comparison is case-insensitive; a missing MIME type or extension
        is treated as an empty string.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        return mimetype in ACCEPTED_MIME_TYPES or extension in ACCEPTED_FILE_EXTENSIONS

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Read the stream and return its cues as Markdown paragraphs.

        The bytes are decoded with ``stream_info.charset`` when it is set,
        otherwise with the encoding guessed by ``charset_normalizer``. Each
        cue becomes one paragraph; cues are separated by a blank line.
        """
        raw = file_stream.read()
        if stream_info.charset:
            text = raw.decode(stream_info.charset)
        else:
            best = from_bytes(raw).best()
            # best() is None when no encoding could be determined; decode
            # leniently rather than processing the literal string "None".
            if best is not None:
                text = str(best)
            else:
                text = raw.decode("utf-8", errors="replace")

        # WebVTT permits CRLF, LF, or bare CR line terminators; normalise so
        # the blank-line split below works for all three.
        text = text.replace("\r\n", "\n").replace("\r", "\n")

        converted = (self._convert_block(block) for block in re.split(r"\n\s*\n", text))
        return DocumentConverterResult(markdown="\n\n".join(cue for cue in converted if cue))

    def _convert_block(self, block: str) -> str:
        """Return the cleaned text of one blank-line-delimited block.

        Returns an empty string for blocks that are not cues: empty blocks,
        the header, ``NOTE``/``STYLE``/``REGION`` blocks, and blocks without
        a timing line.
        """
        lines = [line.strip() for line in block.splitlines() if line.strip()]
        if not lines:
            return ""

        # A byte-order mark may survive decoding on the very first line.
        first = lines[0].lstrip("\ufeff")
        if self._NON_CUE_BLOCK_RE.match(first):
            return ""

        # Anything before the timing line is the optional cue identifier.
        timestamp_index = next(
            (i for i, line in enumerate(lines) if _TIMESTAMP_RE.match(line)),
            None,
        )
        if timestamp_index is None:
            return ""

        text_lines = [self._clean_text_line(line) for line in lines[timestamp_index + 1 :]]
        # Lines made up solely of markup clean down to "" and are dropped.
        return "\n".join(line for line in text_lines if line)

    def _clean_text_line(self, line: str) -> str:
        """Strip cue markup from one payload line and collapse whitespace."""
        # Turn "<v Alice>" into "Alice: " before the generic tag removal
        # below would discard the speaker name.
        line = _VOICE_OPEN_RE.sub(r"\1: ", line)
        line = _TAG_RE.sub("", line)
        # Unescape after tag removal so "&lt;" is not re-read as a tag.
        line = html.unescape(line)
        return re.sub(r"\s+", " ", line).strip()
31 changes: 31 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,37 @@ def test_exceptions() -> None:
assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"


def test_webvtt_converter_outputs_clean_transcript(tmp_path) -> None:
    """A .vtt file converts to its cue text only.

    The fixture contains a header with metadata, a NOTE block, a STYLE block,
    a numbered cue with settings and a voice tag, and a cue with inline
    markup; only the two cues' text should survive.
    """
    source = """WEBVTT
Kind: captions

NOTE this should be ignored

1
00:00:01.000 --> 00:00:03.000 position:10%
<v Alice>Hello &amp; welcome</v>
to the meeting.

STYLE
::cue { color: lime; }

00:00:04.000 --> 00:00:06.000
<i>Next action item</i>
"""
    expected = (
        "Alice: Hello & welcome\n"
        "to the meeting.\n\n"
        "Next action item"
    )

    path = tmp_path / "meeting.vtt"
    path.write_text(source, encoding="utf-8")

    converted = MarkItDown().convert(str(path))

    assert converted.text_content == expected


@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
Expand Down