Text & Transcription

Classes for handling transcriptions and burning subtitles onto video.

Transcription Classes

Transcription

Source code in src/videopython/base/transcription.py

class Transcription:
    """Timed speech transcription made of segments that each hold their words.

    Attributes:
        segments: The transcription's segments, in the order supplied or built.
        speakers: Set of non-``None`` speaker labels collected at construction
            time (from the segments, or from the words when built from words).
        language: Language code passed to the constructor, or ``None``.
    """

    def __init__(
        self,
        segments: list[TranscriptionSegment] | None = None,
        words: list[TranscriptionWord] | None = None,
        language: str | None = None,
    ):
        """Build a transcription from ready-made segments or from a flat word list.

        Args:
            segments: Pre-constructed segments (backward compatible)
            words: Words to group into segments by speaker (for diarization)
            language: ISO 639-1 language code detected during transcription (e.g. "en", "pl")

        Raises:
            ValueError: If both or neither of ``segments`` and ``words`` are provided
        """
        if (segments is None) == (words is None):
            raise ValueError("Exactly one of 'segments' or 'words' must be provided")

        self.language = language

        if segments is None:
            # Only reachable with ``words`` set; the assert narrows the type.
            assert words is not None
            self.segments = self._words_to_segments(words)
            self.speakers = {w.speaker for w in words if w.speaker is not None}
        else:
            self.segments = segments
            self.speakers = {s.speaker for s in segments if s.speaker is not None}

    @property
    def words(self) -> list[TranscriptionWord]:
        """Return a new flat list with the words of every segment, in order."""
        return [word for segment in self.segments for word in segment.words]

    def _words_to_segments(self, words: list[TranscriptionWord]) -> list[TranscriptionSegment]:
        """Group consecutive words that share a speaker into one segment each."""
        segments: list[TranscriptionSegment] = []
        run: list[TranscriptionWord] = []

        for word in words:
            # A change of speaker closes the current run.
            if run and word.speaker != run[0].speaker:
                segments.append(TranscriptionSegment.from_words(run, speaker=run[0].speaker))
                run = []
            run.append(word)

        if run:
            segments.append(TranscriptionSegment.from_words(run, speaker=run[0].speaker))

        return segments

    def speaker_stats(self) -> dict[str, float]:
        """Calculate each speaker's share of the total speaking time.

        Only words with a non-``None`` speaker are counted. Speakers that
        appear on words but not in ``self.speakers`` (for example when the
        segments carry no speaker label) are added to the result instead of
        raising ``KeyError``.

        Returns:
            Dictionary mapping speaker names to their fraction (0.0 to 1.0) of
            the total speaking time. All values stay 0.0 when no time was
            accumulated.
        """
        shares: dict[str, float] = dict.fromkeys(self.speakers, 0.0)
        total = 0.0

        for word in self.words:
            if word.speaker is None:
                continue
            duration = word.end - word.start
            total += duration
            shares[word.speaker] = shares.get(word.speaker, 0.0) + duration

        if total > 0:
            for speaker in shares:
                shares[speaker] /= total

        return shares

    def offset(self, time: float) -> Transcription:
        """Return a new Transcription with every timestamp shifted by ``time``.

        Args:
            time: Amount added to each segment and word start/end value.
        """

        def _shift(word: TranscriptionWord) -> TranscriptionWord:
            return TranscriptionWord(start=word.start + time, end=word.end + time, word=word.word, speaker=word.speaker)

        # ``replace`` copies every other segment field (text, speaker,
        # confidence values) unchanged -- only timestamps and words move.
        shifted = [
            replace(
                segment,
                start=segment.start + time,
                end=segment.end + time,
                words=[_shift(w) for w in segment.words],
            )
            for segment in self.segments
        ]
        return Transcription(segments=shifted, language=self.language)

    def standardize_segments(self, *, time: float | None = None, num_words: int | None = None) -> Transcription:
        """Return a new Transcription whose segments respect a size limit.

        All words are regrouped in order. A new segment starts whenever the
        speaker changes, or when adding the next word would exceed the limit.
        In ``time`` mode a single word longer than ``time`` still forms its
        own segment.

        Args:
            time: Maximum duration in seconds for each segment
            num_words: Maximum number of words per segment

        Raises:
            ValueError: If both or neither of ``time`` and ``num_words`` are
                provided, or if the provided value is not positive
        """
        if (time is None) == (num_words is None):
            raise ValueError("Exactly one of 'time' or 'num_words' must be provided")

        if time is not None and time <= 0:
            raise ValueError("Time must be positive")

        if num_words is not None and num_words <= 0:
            raise ValueError("Number of words must be positive")

        def _must_split(group: list[TranscriptionWord], candidate: TranscriptionWord) -> bool:
            """Tell whether ``candidate`` has to open a new group."""
            if candidate.speaker != group[0].speaker:
                return True
            if time is not None:
                return candidate.end - group[0].start > time
            assert num_words is not None  # guaranteed by the validation above
            return len(group) >= num_words

        groups: list[list[TranscriptionWord]] = []
        current: list[TranscriptionWord] = []
        for word in self.words:
            if current and _must_split(current, word):
                groups.append(current)
                current = []
            current.append(word)
        if current:
            groups.append(current)

        # Words are regrouped across original segments, so the source
        # segments' confidence fields no longer apply -- left at defaults.
        standardized = [TranscriptionSegment.from_words(group, speaker=group[0].speaker) for group in groups]
        return Transcription(segments=standardized, language=self.language)

    def capitalize_sentences(self) -> Transcription:
        """Return a new Transcription with sentence-start capitalization.

        The first alphabetic character of the first word, and of the first
        word after a token ending in a sentence terminator (``.``, ``!``,
        ``?``, ``…``), is upper-cased. All other characters are untouched, so
        acronyms and proper nouns from the source are preserved. Sentence
        state carries across segment boundaries. Timing, speaker, and
        language are carried through unchanged.

        Abbreviation detection is intentionally not attempted: a token like
        ``"U.S."`` is treated as a sentence end. This heuristic is adequate
        for burned-in subtitles and avoids a brittle abbreviation list.
        """
        at_sentence_start = True
        capitalized: list[TranscriptionSegment] = []

        for segment in self.segments:
            rewritten: list[TranscriptionWord] = []
            for word in segment.words:
                text = word.word
                if at_sentence_start:
                    # Tokens without any letter (e.g. "--") leave the flag
                    # set so the next real word gets capitalized instead.
                    for pos, char in enumerate(text):
                        if char.isalpha():
                            text = text[:pos] + char.upper() + text[pos + 1 :]
                            at_sentence_start = False
                            break
                # Trailing wrapper characters are ignored when looking for
                # the terminator.
                if text.rstrip(_TRAILING_WRAPPERS).endswith(_SENTENCE_TERMINATORS):
                    at_sentence_start = True
                rewritten.append(TranscriptionWord(start=word.start, end=word.end, word=text, speaker=word.speaker))

            # Casing-only rewrite: segment boundaries, speaker, and confidence
            # are unchanged; only the tokens (and joined text) differ.
            capitalized.append(replace(segment, text=" ".join(w.word for w in rewritten), words=rewritten))

        return Transcription(segments=capitalized, language=self.language)

    def chunk_segments(self, max_words: int) -> Transcription:
        """Return a new Transcription splitting each segment into smaller cues.

        Each segment is cut into consecutive groups of at most ``max_words``
        words, and every group is turned into its own segment. Unlike
        :meth:`standardize_segments`, words are never merged across the
        original segments, so silence gaps between segments are preserved and
        subtitles do not linger over pauses. Speaker, confidence, and language
        metadata are carried through unchanged.

        Args:
            max_words: Maximum number of words per output segment.

        Raises:
            ValueError: If ``max_words`` is not positive.
        """
        if max_words <= 0:
            raise ValueError("max_words must be positive")

        chunks: list[TranscriptionSegment] = []
        for segment in self.segments:
            if not segment.words:
                # Nothing to split; emit a fresh copy so the result never
                # aliases the source segment.
                chunks.append(replace(segment, words=list(segment.words)))
                continue
            # Splitting *within* one source segment -- its confidence fields
            # still apply, so carry them through.
            chunks.extend(
                TranscriptionSegment.from_words(
                    segment.words[first : first + max_words],
                    speaker=segment.speaker,
                    avg_logprob=segment.avg_logprob,
                    no_speech_prob=segment.no_speech_prob,
                    compression_ratio=segment.compression_ratio,
                )
                for first in range(0, len(segment.words), max_words)
            )

        return Transcription(segments=chunks, language=self.language)

    def slice(self, start: float, end: float) -> Transcription | None:
        """Return a new Transcription containing only words within the time range.

        Slices at word-level granularity: every word that overlaps the range
        is kept whole, and segments are rebuilt from the kept words by
        grouping consecutive words of the same speaker.

        Args:
            start: Start time in seconds (inclusive)
            end: End time in seconds (exclusive)

        Returns:
            New Transcription with words/segments in the time range, or None
            if ``start >= end`` or no word overlaps the range
        """
        if start >= end:
            return None

        overlapping = [word for word in self.words if word.end > start and word.start < end]
        if not overlapping:
            return None

        # Kept words may come from several original segments, so segment
        # confidence values are not carried over.
        return Transcription(segments=self._words_to_segments(overlapping), language=self.language)

    @staticmethod
    def _format_srt_time(seconds: float) -> str:
        """Format seconds as SRT timestamp (HH:MM:SS,mmm).

        The value is rounded to whole milliseconds *before* being split into
        fields, so 1.9996 becomes ``00:00:02,000`` rather than an invalid
        four-digit millisecond field. Negative values are clamped to zero
        because the format has no sign.
        """
        total_millis = max(0, round(seconds * 1000))
        hours, remainder = divmod(total_millis, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    @staticmethod
    def _parse_srt_time(timestamp: str) -> float:
        """Parse SRT timestamp (HH:MM:SS,mmm) to seconds.

        A ``.`` is accepted in place of the ``,`` separator, and a missing
        millisecond part counts as zero.

        Raises:
            ValueError: If the timestamp does not have three ``:``-separated
                numeric fields.
        """
        hours, minutes, rest = timestamp.strip().split(":")
        seconds, _, millis = rest.replace(".", ",").partition(",")
        fraction = int(millis) / 1000 if millis else 0.0
        return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + fraction

    def to_srt(self) -> str:
        """Export transcription as an SRT subtitle string.

        Cues are numbered from 1 and separated by a blank line. The result
        ends with a single newline, or is the empty string when there are no
        segments.
        """
        cues = [
            f"{number}\n{self._format_srt_time(segment.start)} --> "
            f"{self._format_srt_time(segment.end)}\n{segment.text}"
            for number, segment in enumerate(self.segments, start=1)
        ]
        if not cues:
            return ""
        return "\n\n".join(cues) + "\n"

    @classmethod
    def from_srt(cls, srt: str) -> Transcription:
        """Parse an SRT string into a Transcription.

        Each SRT block becomes a segment with a single word spanning the full
        segment duration (word-level timing is not available in SRT). Both
        ``\\n`` and ``\\r\\n`` line endings are understood, and any empty or
        whitespace-only line separates blocks. Blocks with fewer than three
        lines (index, timestamps, text) are skipped.

        Args:
            srt: SRT-formatted string.

        Returns:
            Transcription with one segment per SRT block.

        Raises:
            ValueError: If a block's timestamp line is malformed.
        """
        segments: list[TranscriptionSegment] = []
        lines = srt.replace("\r\n", "\n").replace("\r", "\n").split("\n")

        cue: list[str] = []
        # The trailing "" sentinel flushes the last block.
        for line in [*lines, ""]:
            if line.strip():
                cue.append(line)
                continue

            block, cue = cue, []
            if len(block) < 3:
                continue
            start_str, end_str = block[1].split("-->")
            start = cls._parse_srt_time(start_str)
            end = cls._parse_srt_time(end_str)
            text = "\n".join(block[2:]).strip()

            words = [TranscriptionWord(start=start, end=end, word=text)]
            segments.append(TranscriptionSegment(start=start, end=end, text=text, words=words))

        return cls(segments=segments)

    def save_srt(self, path: str | Path) -> None:
        """Write the SRT export of this transcription to a UTF-8 file.

        Args:
            path: Output file path.
        """
        Path(path).write_text(self.to_srt(), encoding="utf-8")

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary with ``"segments"`` and ``"language"`` keys."""
        serialized = [segment.to_dict() for segment in self.segments]
        return {"segments": serialized, "language": self.language}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Transcription:
        """Create Transcription from a dictionary as produced by :meth:`to_dict`.

        ``"segments"`` is required; ``"language"`` is optional.
        """
        segments = [TranscriptionSegment.from_dict(item) for item in data["segments"]]
        return cls(segments=segments, language=data.get("language"))

words `property`

words: list[TranscriptionWord]

Return all words from all segments.

`__init__`

__init__(
    segments: list[TranscriptionSegment] | None = None,
    words: list[TranscriptionWord] | None = None,
    language: str | None = None,
)

Initialize Transcription from either segments or words.

Parameters:

Name	Type	Description	Default
`segments`	`list[TranscriptionSegment] \| None`	Pre-constructed segments (backward compatible)	`None`
`words`	`list[TranscriptionWord] \| None`	Words to group into segments by speaker (for diarization)	`None`
`language`	`str \| None`	ISO 639-1 language code detected during transcription (e.g. "en", "pl")	`None`

Raises:

Type	Description
`ValueError`	If both or neither arguments are provided

Source code in src/videopython/base/transcription.py

def __init__(
    self,
    segments: list[TranscriptionSegment] | None = None,
    words: list[TranscriptionWord] | None = None,
    language: str | None = None,
):
    """Build a transcription from ready-made segments or from a flat word list.

    Args:
        segments: Pre-constructed segments (backward compatible)
        words: Words to group into segments by speaker (for diarization)
        language: ISO 639-1 language code detected during transcription (e.g. "en", "pl")

    Raises:
        ValueError: If both or neither of ``segments`` and ``words`` are provided
    """
    have_segments = segments is not None
    have_words = words is not None
    if have_segments == have_words:
        raise ValueError("Exactly one of 'segments' or 'words' must be provided")

    self.language = language

    if segments is None:
        # Only reachable with ``words`` set; the assert narrows the type.
        assert words is not None
        self.segments = self._words_to_segments(words)
        self.speakers = {w.speaker for w in words if w.speaker is not None}
        return

    self.segments = segments
    self.speakers = {s.speaker for s in segments if s.speaker is not None}

speaker_stats

speaker_stats() -> dict[str, float]

Calculate each speaker's share of the total speaking time, expressed as a fraction between 0 and 1.

Returns:

Type	Description
`dict[str, float]`	Dictionary mapping speaker names to their fraction (0 to 1) of total speaking time

Source code in src/videopython/base/transcription.py

def speaker_stats(self) -> dict[str, float]:
    """Calculate each speaker's share of the total speaking time.

    Only words with a non-``None`` speaker are counted. Speakers that appear
    on words but not in ``self.speakers`` (for example when the segments
    carry no speaker label) are added to the result instead of raising
    ``KeyError``.

    Returns:
        Dictionary mapping speaker names to their fraction (0.0 to 1.0) of
        the total speaking time. All values stay 0.0 when no time was
        accumulated.
    """
    shares: dict[str, float] = dict.fromkeys(self.speakers, 0.0)
    total = 0.0

    for segment in self.segments:
        for word in segment.words:
            if word.speaker is None:
                continue
            duration = word.end - word.start
            total += duration
            # ``get`` tolerates speakers missing from ``self.speakers``.
            shares[word.speaker] = shares.get(word.speaker, 0.0) + duration

    if total > 0:
        for speaker in shares:
            shares[speaker] /= total

    return shares

offset

offset(time: float) -> Transcription

Return a new Transcription with all timings offset by the provided time value.

Source code in src/videopython/base/transcription.py

def offset(self, time: float) -> Transcription:
    """Return a new Transcription with every timestamp shifted by ``time``.

    Args:
        time: Amount added to each segment and word start/end value.
    """

    def _shift(word: TranscriptionWord) -> TranscriptionWord:
        return TranscriptionWord(start=word.start + time, end=word.end + time, word=word.word, speaker=word.speaker)

    # ``replace`` copies every other segment field (text, speaker,
    # confidence values) unchanged -- only timestamps and words move.
    shifted = [
        replace(
            segment,
            start=segment.start + time,
            end=segment.end + time,
            words=[_shift(w) for w in segment.words],
        )
        for segment in self.segments
    ]
    return Transcription(segments=shifted, language=self.language)

standardize_segments

standardize_segments(
    *,
    time: float | None = None,
    num_words: int | None = None,
) -> Transcription

Return a new Transcription with standardized segments.

Segments are also split on speaker changes so that each segment contains words from a single speaker.

Parameters:

Name	Type	Description	Default
`time`	`float \| None`	Maximum duration in seconds for each segment	`None`
`num_words`	`int \| None`	Maximum number of words per segment	`None`

Raises:

Type	Description
`ValueError`	If both time and num_words are provided or if neither is provided

Source code in src/videopython/base/transcription.py

def standardize_segments(self, *, time: float | None = None, num_words: int | None = None) -> Transcription:
    """Return a new Transcription whose segments respect a size limit.

    All words are regrouped in order. A new segment starts whenever the
    speaker changes, or when adding the next word would exceed the limit.
    In ``time`` mode a single word longer than ``time`` still forms its own
    segment.

    Args:
        time: Maximum duration in seconds for each segment
        num_words: Maximum number of words per segment

    Raises:
        ValueError: If both or neither of ``time`` and ``num_words`` are
            provided, or if the provided value is not positive
    """
    if (time is None) == (num_words is None):
        raise ValueError("Exactly one of 'time' or 'num_words' must be provided")

    if time is not None and time <= 0:
        raise ValueError("Time must be positive")

    if num_words is not None and num_words <= 0:
        raise ValueError("Number of words must be positive")

    def _must_split(group: list[TranscriptionWord], candidate: TranscriptionWord) -> bool:
        """Tell whether ``candidate`` has to open a new group."""
        if candidate.speaker != group[0].speaker:
            return True
        if time is not None:
            return candidate.end - group[0].start > time
        assert num_words is not None  # guaranteed by the validation above
        return len(group) >= num_words

    groups: list[list[TranscriptionWord]] = []
    current: list[TranscriptionWord] = []
    for segment in self.segments:
        for word in segment.words:
            if current and _must_split(current, word):
                groups.append(current)
                current = []
            current.append(word)
    if current:
        groups.append(current)

    # Words are regrouped across original segments, so the source segments'
    # confidence fields no longer apply -- left at defaults.
    standardized = [TranscriptionSegment.from_words(group, speaker=group[0].speaker) for group in groups]
    return Transcription(segments=standardized, language=self.language)

capitalize_sentences

capitalize_sentences() -> Transcription

Return a new Transcription with sentence-start capitalization.

The first letter of the first spoken word and of every word that follows sentence-ending punctuation (., !, ?, …) is upper-cased. Remaining characters are left untouched, so acronyms and proper nouns from the source transcription are preserved. Timing, speaker, and language are carried through unchanged.

Abbreviation detection is intentionally not attempted: a token like "U.S." is treated as a sentence end. This heuristic is adequate for burned-in subtitles and avoids a brittle abbreviation list.

Source code in src/videopython/base/transcription.py

def capitalize_sentences(self) -> Transcription:
    """Return a new Transcription with sentence-start capitalization.

    The first alphabetic character of the first word, and of the first word
    after a token ending in a sentence terminator (``.``, ``!``, ``?``,
    ``…``), is upper-cased. All other characters are untouched, so acronyms
    and proper nouns from the source are preserved. Sentence state carries
    across segment boundaries. Timing, speaker, and language are carried
    through unchanged.

    Abbreviation detection is intentionally not attempted: a token like
    ``"U.S."`` is treated as a sentence end. This heuristic is adequate for
    burned-in subtitles and avoids a brittle abbreviation list.
    """
    at_sentence_start = True
    capitalized: list[TranscriptionSegment] = []

    for segment in self.segments:
        rewritten: list[TranscriptionWord] = []
        for word in segment.words:
            text = word.word
            if at_sentence_start:
                # Tokens without any letter (e.g. "--") leave the flag set so
                # the next real word gets capitalized instead.
                for pos, char in enumerate(text):
                    if char.isalpha():
                        text = text[:pos] + char.upper() + text[pos + 1 :]
                        at_sentence_start = False
                        break
            # Trailing wrapper characters are ignored when looking for the
            # terminator.
            if text.rstrip(_TRAILING_WRAPPERS).endswith(_SENTENCE_TERMINATORS):
                at_sentence_start = True
            rewritten.append(TranscriptionWord(start=word.start, end=word.end, word=text, speaker=word.speaker))

        # Casing-only rewrite: segment boundaries, speaker, and confidence
        # are unchanged; only the tokens (and joined text) differ.
        capitalized.append(replace(segment, text=" ".join(w.word for w in rewritten), words=rewritten))

    return Transcription(segments=capitalized, language=self.language)

chunk_segments

chunk_segments(max_words: int) -> Transcription

Return a new Transcription splitting each segment into smaller cues.

Each segment is split into consecutive groups of at most `max_words` words, using that group's own first/last word timings. Unlike `standardize_segments`, words are never merged across the original segments, so silence gaps between segments are preserved and subtitles do not linger over pauses. Speaker, confidence, and language metadata are carried through unchanged.

Parameters:

Name	Type	Description	Default
`max_words`	`int`	Maximum number of words per output segment.	required

Raises:

Type	Description
`ValueError`	If `max_words` is not positive.

Source code in src/videopython/base/transcription.py

def chunk_segments(self, max_words: int) -> Transcription:
    """Return a new Transcription splitting each segment into smaller cues.

    Each segment is cut into consecutive groups of at most ``max_words``
    words, and every group is turned into its own segment. Unlike
    :meth:`standardize_segments`, words are never merged across the original
    segments, so silence gaps between segments are preserved and subtitles do
    not linger over pauses. Speaker, confidence, and language metadata are
    carried through unchanged.

    Args:
        max_words: Maximum number of words per output segment.

    Raises:
        ValueError: If ``max_words`` is not positive.
    """
    if max_words <= 0:
        raise ValueError("max_words must be positive")

    chunks: list[TranscriptionSegment] = []
    for segment in self.segments:
        if not segment.words:
            # Nothing to split; emit a fresh copy so the result never
            # aliases the source segment.
            chunks.append(replace(segment, words=list(segment.words)))
            continue
        # Splitting *within* one source segment -- its confidence fields
        # still apply, so carry them through.
        chunks.extend(
            TranscriptionSegment.from_words(
                segment.words[first : first + max_words],
                speaker=segment.speaker,
                avg_logprob=segment.avg_logprob,
                no_speech_prob=segment.no_speech_prob,
                compression_ratio=segment.compression_ratio,
            )
            for first in range(0, len(segment.words), max_words)
        )

    return Transcription(segments=chunks, language=self.language)

slice

slice(start: float, end: float) -> Transcription | None

Return a new Transcription containing only words within the time range.

Slices at word-level granularity: words that overlap with the time range are included, and new segments are reconstructed from the included words.

Parameters:

Name	Type	Description	Default
`start`	`float`	Start time in seconds (inclusive)	required
`end`	`float`	End time in seconds (exclusive)	required

Returns:

Type	Description
`Transcription \| None`	New Transcription with words/segments in the time range, or None if no words overlap

Source code in src/videopython/base/transcription.py

def slice(self, start: float, end: float) -> Transcription | None:
    """Return a new Transcription containing only words within the time range.

    Slices at word-level granularity: every word that overlaps the range is
    kept whole, and segments are rebuilt from the kept words by grouping
    consecutive words of the same speaker.

    Args:
        start: Start time in seconds (inclusive)
        end: End time in seconds (exclusive)

    Returns:
        New Transcription with words/segments in the time range, or None if
        ``start >= end`` or no word overlaps the range
    """
    if start >= end:
        return None

    kept = [
        word
        for segment in self.segments
        for word in segment.words
        if word.end > start and word.start < end
    ]
    if not kept:
        return None

    # Split the kept words into runs of consecutive same-speaker words.
    runs: list[list[TranscriptionWord]] = []
    for word in kept:
        if runs and runs[-1][0].speaker == word.speaker:
            runs[-1].append(word)
        else:
            runs.append([word])

    # Runs may span several original segments, so segment confidence values
    # are not carried over.
    sliced = [TranscriptionSegment.from_words(run, speaker=run[0].speaker) for run in runs]
    return Transcription(segments=sliced, language=self.language)

to_srt

to_srt() -> str

Export transcription as an SRT subtitle string.

Source code in src/videopython/base/transcription.py

def to_srt(self) -> str:
    """Export transcription as an SRT subtitle string.

    Cues are numbered from 1 and separated by a blank line. The result ends
    with a single newline, or is the empty string when there are no segments.
    """
    cues = [
        f"{number}\n{self._format_srt_time(segment.start)} --> "
        f"{self._format_srt_time(segment.end)}\n{segment.text}"
        for number, segment in enumerate(self.segments, start=1)
    ]
    if not cues:
        return ""
    return "\n\n".join(cues) + "\n"

from_srt `classmethod`

from_srt(srt: str) -> Transcription

Parse an SRT string into a Transcription.

Each SRT block becomes a segment with a single word spanning the full segment duration (word-level timing is not available in SRT).

Parameters:

Name	Type	Description	Default
`srt`	`str`	SRT-formatted string.	required

Returns:

Type	Description
`Transcription`	Transcription with one segment per SRT block.

Source code in src/videopython/base/transcription.py

@classmethod
def from_srt(cls, srt: str) -> Transcription:
    """Parse an SRT string into a Transcription.

    Each SRT block becomes a segment with a single word spanning the full
    segment duration (word-level timing is not available in SRT). Both
    ``\\n`` and ``\\r\\n`` line endings are understood, and any empty or
    whitespace-only line separates blocks. Blocks with fewer than three lines
    (index, timestamps, text) are skipped.

    Args:
        srt: SRT-formatted string.

    Returns:
        Transcription with one segment per SRT block.

    Raises:
        ValueError: If a block's timestamp line is malformed.
    """
    segments: list[TranscriptionSegment] = []
    lines = srt.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    cue: list[str] = []
    # The trailing "" sentinel flushes the last block.
    for line in [*lines, ""]:
        if line.strip():
            cue.append(line)
            continue

        block, cue = cue, []
        if len(block) < 3:
            continue
        start_str, end_str = block[1].split("-->")
        start = cls._parse_srt_time(start_str)
        end = cls._parse_srt_time(end_str)
        text = "\n".join(block[2:]).strip()

        words = [TranscriptionWord(start=start, end=end, word=text)]
        segments.append(TranscriptionSegment(start=start, end=end, text=text, words=words))

    return cls(segments=segments)

save_srt

save_srt(path: str | Path) -> None

Write transcription to an SRT file.

Parameters:

Name	Type	Description	Default
`path`	`str \| Path`	Output file path.	required

Source code in src/videopython/base/transcription.py

def save_srt(self, path: str | Path) -> None:
    """Serialize this transcription to SRT and store it in a file.

    Args:
        path: Destination file path; the SRT text is written as UTF-8.
    """
    destination = Path(path)
    srt_text = self.to_srt()
    destination.write_text(srt_text, encoding="utf-8")

to_dict

to_dict() -> dict[str, Any]

Convert to dictionary for JSON serialization.

Source code in src/videopython/base/transcription.py

def to_dict(self) -> dict[str, Any]:
    """Build a JSON-serializable mapping of this transcription.

    Returns:
        A dict holding ``"segments"`` (each segment's own ``to_dict()``
        output, in order) and ``"language"``.
    """
    serialized_segments = []
    for segment in self.segments:
        serialized_segments.append(segment.to_dict())
    return {"segments": serialized_segments, "language": self.language}

from_dict `classmethod`

from_dict(data: dict[str, Any]) -> Transcription

Create Transcription from dictionary.

Source code in src/videopython/base/transcription.py

@classmethod
def from_dict(cls, data: dict[str, Any]) -> Transcription:
    """Create Transcription from dictionary."""
    return cls(
        segments=[TranscriptionSegment.from_dict(s) for s in data["segments"]],
        language=data.get("language"),
    )

TranscriptionSegment

TranscriptionSegment `dataclass`

Source code in src/videopython/base/transcription.py

@dataclass
class TranscriptionSegment:
    # Timing, text and word list are required; speaker and the three
    # confidence-style fields are optional and default to None.
    start: float
    end: float
    text: str
    words: list[TranscriptionWord]
    speaker: str | None = None
    avg_logprob: float | None = None
    no_speech_prob: float | None = None
    compression_ratio: float | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable mapping of this segment.

        Words are serialized through their own ``to_dict()``.
        """
        word_dicts = [entry.to_dict() for entry in self.words]
        return dict(
            start=self.start,
            end=self.end,
            text=self.text,
            words=word_dicts,
            speaker=self.speaker,
            avg_logprob=self.avg_logprob,
            no_speech_prob=self.no_speech_prob,
            compression_ratio=self.compression_ratio,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TranscriptionSegment:
        """Rebuild a segment from its dictionary form.

        ``start``, ``end``, ``text`` and ``words`` are required keys; the
        remaining fields fall back to ``None`` when absent.
        """
        start, end, text = data["start"], data["end"], data["text"]
        parsed_words = [TranscriptionWord.from_dict(item) for item in data["words"]]
        optional = {
            key: data.get(key)
            for key in ("speaker", "avg_logprob", "no_speech_prob", "compression_ratio")
        }
        return cls(start=start, end=end, text=text, words=parsed_words, **optional)

    @classmethod
    def from_words(
        cls,
        words: list[TranscriptionWord],
        *,
        speaker: str | None = None,
        avg_logprob: float | None = None,
        no_speech_prob: float | None = None,
        compression_ratio: float | None = None,
    ) -> TranscriptionSegment:
        """Create a segment that covers ``words``.

        ``start`` is taken from the first word, ``end`` from the last, and
        ``text`` is every word joined with a single space. ``speaker`` and
        the confidence fields are stored as given (``None`` when omitted).
        The segment holds its own copy of the list, not the caller's object.

        Raises:
            ValueError: If ``words`` is empty.
        """
        if words:
            first, last = words[0], words[-1]
            joined = " ".join([entry.word for entry in words])
            return cls(
                start=first.start,
                end=last.end,
                text=joined,
                words=list(words),
                speaker=speaker,
                avg_logprob=avg_logprob,
                no_speech_prob=no_speech_prob,
                compression_ratio=compression_ratio,
            )
        raise ValueError("from_words requires a non-empty word list")

to_dict

to_dict() -> dict[str, Any]

Convert to dictionary for JSON serialization.

Source code in src/videopython/base/transcription.py

def to_dict(self) -> dict[str, Any]:
    """Return a JSON-serializable mapping of this segment.

    Words are serialized through their own ``to_dict()``.
    """
    word_dicts = [entry.to_dict() for entry in self.words]
    return dict(
        start=self.start,
        end=self.end,
        text=self.text,
        words=word_dicts,
        speaker=self.speaker,
        avg_logprob=self.avg_logprob,
        no_speech_prob=self.no_speech_prob,
        compression_ratio=self.compression_ratio,
    )

from_dict `classmethod`

from_dict(data: dict[str, Any]) -> TranscriptionSegment

Create TranscriptionSegment from dictionary.

Source code in src/videopython/base/transcription.py

@classmethod
def from_dict(cls, data: dict[str, Any]) -> TranscriptionSegment:
    """Create TranscriptionSegment from dictionary."""
    return cls(
        start=data["start"],
        end=data["end"],
        text=data["text"],
        words=[TranscriptionWord.from_dict(w) for w in data["words"]],
        speaker=data.get("speaker"),
        avg_logprob=data.get("avg_logprob"),
        no_speech_prob=data.get("no_speech_prob"),
        compression_ratio=data.get("compression_ratio"),
    )

from_words `classmethod`

from_words(
    words: list[TranscriptionWord],
    *,
    speaker: str | None = None,
    avg_logprob: float | None = None,
    no_speech_prob: float | None = None,
    compression_ratio: float | None = None,
) -> TranscriptionSegment

Build a segment spanning words, deriving start/end/text from them.

words must be non-empty: start/end come from the first/last word and text is the words joined by single spaces. Speaker and the confidence fields are passed through so callers re-segmenting within a known source segment can preserve them; callers regrouping words across segments (where these are ambiguous) simply omit them, leaving None. The words list is copied, so the result never aliases the caller's.

Source code in src/videopython/base/transcription.py

@classmethod
def from_words(
    cls,
    words: list[TranscriptionWord],
    *,
    speaker: str | None = None,
    avg_logprob: float | None = None,
    no_speech_prob: float | None = None,
    compression_ratio: float | None = None,
) -> TranscriptionSegment:
    """Build a segment spanning ``words``, deriving start/end/text from them.

    ``words`` must be non-empty: ``start``/``end`` come from the first/last
    word and ``text`` is the words joined by single spaces. Speaker and the
    confidence fields are passed through so callers re-segmenting *within* a
    known source segment can preserve them; callers regrouping words across
    segments (where these are ambiguous) simply omit them, leaving ``None``.
    The ``words`` list is copied, so the result never aliases the caller's.
    """
    if not words:
        raise ValueError("from_words requires a non-empty word list")
    return cls(
        start=words[0].start,
        end=words[-1].end,
        text=" ".join(w.word for w in words),
        words=list(words),
        speaker=speaker,
        avg_logprob=avg_logprob,
        no_speech_prob=no_speech_prob,
        compression_ratio=compression_ratio,
    )

TranscriptionWord

TranscriptionWord `dataclass`

Source code in src/videopython/base/transcription.py

@dataclass
class TranscriptionWord:
    # One word with its start/end timestamps; speaker is optional.
    start: float
    end: float
    word: str
    speaker: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable mapping of this word."""
        return dict(
            start=self.start,
            end=self.end,
            word=self.word,
            speaker=self.speaker,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TranscriptionWord:
        """Rebuild a word from its dictionary form.

        ``start``, ``end`` and ``word`` are required keys; ``speaker`` falls
        back to ``None`` when absent.
        """
        start, end, word = data["start"], data["end"], data["word"]
        return cls(start=start, end=end, word=word, speaker=data.get("speaker"))

to_dict

to_dict() -> dict[str, Any]

Convert to dictionary for JSON serialization.

Source code in src/videopython/base/transcription.py

def to_dict(self) -> dict[str, Any]:
    """Return a JSON-serializable mapping of this word."""
    return dict(
        start=self.start,
        end=self.end,
        word=self.word,
        speaker=self.speaker,
    )

from_dict `classmethod`

from_dict(data: dict[str, Any]) -> TranscriptionWord

Create TranscriptionWord from dictionary.

Source code in src/videopython/base/transcription.py

@classmethod
def from_dict(cls, data: dict[str, Any]) -> TranscriptionWord:
    """Create TranscriptionWord from dictionary."""
    return cls(
        start=data["start"],
        end=data["end"],
        word=data["word"],
        speaker=data.get("speaker"),
    )

Overlay Classes

TranscriptionOverlay

Render transcriptions as subtitles with word-level highlighting. The add_subtitles op (class TranscriptionOverlay) runs through the streaming engine, so it executes inside a VideoEdit rather than against a Video directly. It declares requires=("transcription",); pass the transcription via the context argument to run_to_file:

from videopython.editing import VideoEdit

# `transcription` must already exist here (e.g. produced by AudioToText or created manually).

# Describe the edit declaratively: one source segment with a single add_subtitles operation.
edit = VideoEdit.from_dict(
    {
        "segments": [
            {
                "source": "input.mp4",
                "start": 0.0,
                "end": 5.0,
                "operations": [
                    {
                        "op": "add_subtitles",
                        "style": "boxed",   # boxed | outline | clean | karaoke
                        "region": "bottom", # top | center | bottom
                        "font_scale": 0.055,  # font height as a fraction of frame height
                        # "font": "poppins-bold",  # optional bundled font; omit for default
                    }
                ],
            }
        ]
    }
)
# add_subtitles declares requires=("transcription",), so the transcription is passed via `context`.
edit.run_to_file("output.mp4", context={"transcription": transcription})

Geometry is resolution-independent by default: font_scale/region are fractions of the frame, so the same overlay renders correctly at any output size. The absolute fields (font_size, position, box_width, explicit colors, ...) remain optional advanced overrides -- leave them unset to derive from the style/region/font_scale presets. Rendering is done by libass (ffmpeg's subtitles= filter) from a compile-time ASS document: native speed, and long cues wrap within the box instead of failing to fit.

TranscriptionOverlay

Bases: Effect

Renders animated word-by-word subtitles with the current word highlighted.

Each word lights up in the highlight color (enlarged by the size multiplier) as it is spoken, based on transcription timestamps. Requires a word-level transcription, which the runner supplies via the `requires=("transcription",)` declaration -- re-based onto the segment's local timeline and delivered at plan-compile time through `FilterCtx`; the op compiles to a libass `subtitles=` filter (`compiles_to_filter`), so subtitled edits run on the O(1)-memory streaming path at native speed.

Source code in src/videopython/editing/transcription_overlay.py

class TranscriptionOverlay(Effect):
    """Renders animated word-by-word subtitles with the current word highlighted.

    Each word lights up in the highlight color (enlarged by the size
    multiplier) as it is spoken, based on transcription timestamps. Requires a
    word-level transcription, which the runner supplies via the
    ``requires=("transcription",)`` declaration -- re-based onto the segment's
    local timeline and delivered at plan-compile time through
    :class:`FilterCtx`; the op compiles to a libass ``subtitles=`` filter
    (:attr:`compiles_to_filter`), so subtitled edits run on the O(1)-memory
    streaming path at native speed.
    """

    # Operation name used in edit dicts, and the context keys this op needs
    # the runner to supply.
    op: Literal["add_subtitles"] = "add_subtitles"
    requires: ClassVar[tuple[str, ...]] = ("transcription",)

    # ---- primary, resolution-independent surface ----
    style: SubtitleStyle = Field(
        SubtitleStyle.BOXED,
        description='Look preset bundling colors/border/background/highlight: "boxed", "outline", "clean", "karaoke".',
    )
    region: SubtitleRegion = Field(
        SubtitleRegion.BOTTOM,
        description='Vertical placement band: "top", "center", or "bottom" of the frame.',
    )
    font_scale: float = Field(
        0.055,
        gt=0.0,
        le=0.5,
        description=(
            "Base font height as a fraction of frame height (resolution-independent; the recommended "
            "way to size subtitles). Long cues wrap within the box."
        ),
    )
    max_words_per_cue: int | None = Field(
        5,
        ge=1,
        description=(
            "Maximum words shown on screen at once. Each transcription segment is re-chunked into "
            "cues of at most this many words, without bridging the silence gaps between segments, so "
            "subtitles stay readable and don't linger over pauses. None preserves the source "
            "transcription's segmentation."
        ),
    )
    capitalize: bool = Field(
        True,
        description=(
            "Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
            "Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
            "exactly as transcribed."
        ),
    )
    font: Literal["anton", "bebas-neue", "lato-bold", "poppins-bold"] | None = Field(
        None,
        description=(
            "Bundled font for subtitles, or null for the default. "
            "'poppins-bold': clean geometric sans, general purpose. "
            "'lato-bold': humanist sans, very readable. "
            "'anton': tall condensed display, ideal for short-form vertical. "
            "'bebas-neue': bold condensed display, dramatic alternative."
        ),
    )
    font_filename: str | None = Field(
        None,
        description=(
            "Advanced override: path to a .ttf font file for subtitle text. Takes precedence over `font`; "
            "None for the bundled default font."
        ),
        json_schema_extra={"llm_hidden": True},
    )
    # ---- advanced overrides: None => derive from style/region/font_scale ----
    font_size: int | None = Field(
        None,
        ge=1,
        description=(
            "Advanced override: absolute base font size in pixels. Leave None to derive from "
            "`font_scale` (recommended -- resolution-independent)."
        ),
    )
    font_border_size: int | None = Field(
        None, ge=0, description="Advanced override for outline thickness in px. None takes it from `style`."
    )
    text_color: RGBColor | None = Field(
        None, description="Advanced override for default text color [R, G, B] (0-255). None takes it from `style`."
    )
    # Unlike the other overrides, None is a meaningful value here (no
    # background), so the "use the preset" default is the _AUTO sentinel.
    background_color: RGBAColor | None | Literal["auto"] = Field(
        _AUTO,
        description=(
            'Advanced override for the box background [R, G, B, A] (0-255). "auto" takes it from `style`; '
            "null explicitly disables the background."
        ),
    )
    background_padding: int | None = Field(
        None, ge=0, description="Advanced override: px between text and background edge. None takes it from `style`."
    )
    highlight_color: RGBColor | None = Field(
        None, description="Advanced override for the spoken-word color [R, G, B]. None takes it from `style`."
    )
    highlight_size_multiplier: float | None = Field(
        None, gt=0, description="Advanced override: scale factor for the highlighted word. None takes it from `style`."
    )
    position: tuple[float, float] | None = Field(
        None,
        description="Advanced override: box center as normalized (x, y). None derives it from `region`.",
    )
    box_width: float | None = Field(
        None,
        gt=0.0,
        le=1.0,
        description="Advanced override: box width as a fraction of frame width in (0, 1]. None uses 0.6.",
    )
    anchor: AnchorPoint | None = Field(
        None, description="Advanced override: which point of the box sits at the position. None uses center."
    )

    # ------------------------------------------------------------- resolution

    @property
    def compiles_to_filter(self) -> bool:
        """Always ``True``: this op is expressed as an ffmpeg filter (see ``to_ffmpeg_filter``)."""
        return True

    def _style_params(self) -> _StyleParams:
        """Effective look: the ``style`` preset overlaid by any explicit overrides."""
        p = _STYLE_PRESETS[self.style]
        # Only the _AUTO sentinel defers to the preset; an explicit None is
        # kept so the background can be switched off.
        bg = p.background_color if self.background_color == _AUTO else self.background_color
        return _StyleParams(
            text_color=self.text_color or p.text_color,
            highlight_color=self.highlight_color or p.highlight_color,
            # ``is not None`` checks so that an explicit 0 override is honored.
            border=self.font_border_size if self.font_border_size is not None else p.border,
            background_color=bg,
            background_padding=(
                self.background_padding if self.background_padding is not None else p.background_padding
            ),
            highlight_size_multiplier=(
                self.highlight_size_multiplier
                if self.highlight_size_multiplier is not None
                else p.highlight_size_multiplier
            ),
        )

    def _transform(self, transcription: Transcription) -> Transcription:
        """Apply the cue transforms every render path MUST share."""
        # Re-chunk first, then capitalize; both steps are optional.
        if self.max_words_per_cue is not None:
            transcription = transcription.chunk_segments(self.max_words_per_cue)
        if self.capitalize:
            transcription = transcription.capitalize_sentences()
        return transcription

    def _ass_font(self) -> tuple[str, bool, Path]:
        """``(family, bold, fontsdir)`` for libass font matching.

        libass matches by the family name inside the font file, not the
        filename, so a ``font_filename`` override is probed for its name-table
        family (falling back to the bundled default on an unreadable file --
        the same never-hard-fail policy as ``load_font``).
        """
        if self.font_filename:
            path = Path(self.font_filename)
            try:
                # The size passed to truetype() is only needed to open the
                # file; just the (family, face) names are used.
                family, face = ImageFont.truetype(str(path), 16).getname()
                return family, "bold" in (face or "").lower(), path.parent
            except (OSError, ValueError):
                # Unreadable override: fall through to the bundled font below.
                pass
        family, bold = BUNDLED_FONT_FAMILIES.get(self.font, BUNDLED_FONT_FAMILIES[None])
        return family, bold, bundled_fonts_dir()

    def _ass_look(self, height: int) -> AssLook:
        """Resolve every override-or-preset field into the ASS look."""
        sp = self._style_params()
        font_px = self.font_size if self.font_size is not None else max(1, round(self.font_scale * height))
        # libass interprets Fontsize as the GDI cell height (ascender +
        # descender) while PIL sizes the em square; scale by the font's
        # cell/em ratio so font_scale keeps its historical apparent size
        # (1.5x divergence for a tall display font like Anton without this).
        # The font is loaded at size 100 so (ascent + descent) / 100 is that ratio.
        metrics_font = load_font(self.font_filename or self.font, 100)
        if isinstance(metrics_font, ImageFont.FreeTypeFont):
            ascent, descent = metrics_font.getmetrics()
            font_px = max(1, round(font_px * (ascent + descent) / 100))
        # The fonts directory is not part of the look; _filter_expr reads it separately.
        family, bold, _ = self._ass_font()
        return AssLook(
            font_family=family,
            bold=bold,
            font_px=font_px,
            text_color=sp.text_color,
            highlight_color=sp.highlight_color,
            outline_px=sp.border,
            background=sp.background_color,
            background_padding=sp.background_padding,
            highlight_size_multiplier=sp.highlight_size_multiplier,
            position=self.position if self.position is not None else _REGION_POSITION[self.region],
            anchor=self.anchor if self.anchor is not None else AnchorPoint.CENTER,
            box_width=self.box_width if self.box_width is not None else 0.6,
        )

    def _compile_ass(self, transcription: Transcription, width: int, height: int) -> str:
        """The full ASS document for ``transcription`` at the given frame size.

        ``transcription`` timestamps must be local to the timeline the frames
        come from (the plan builder re-bases context onto the cut segment). The
        ``window`` is applied by clipping event times -- the ``subtitles``
        filter has no timeline support.
        """
        window = (self.window.start, self.window.stop) if self.window is not None else None
        return build_ass(
            self._transform(transcription),
            width=width,
            height=height,
            look=self._ass_look(height),
            window=window,
        )

    def _write_ass(self, document: str) -> Path:
        """Write ``document`` to a new UTF-8 temp file with an ``.ass`` suffix and return its path.

        The file is created with ``delete=False``, so it stays on disk after
        being closed; removing it is left to the caller.
        """
        tmp = tempfile.NamedTemporaryFile("w", suffix=".ass", delete=False, encoding="utf-8")
        try:
            tmp.write(document)
        finally:
            # NOTE(review): if write() raises, the handle is closed but the
            # temp file itself is not removed here.
            tmp.close()
        return Path(tmp.name)

    def _filter_expr(self, ass_path: Path) -> str:
        """Build the ``subtitles=`` filter string for ``ass_path``.

        Both the file path and the fonts directory from ``_ass_font`` are
        passed through ``escape_filter_value``.
        """
        _, _, fonts_dir = self._ass_font()
        return f"subtitles=filename={escape_filter_value(str(ass_path))}:fontsdir={escape_filter_value(str(fonts_dir))}"

    # ------------------------------------------------------------- execution

    def to_ffmpeg_filter(self, ctx: FilterCtx) -> str | None:
        """Compile to a libass ``subtitles=`` filter entry.

        Consumes the segment-local transcription from ``ctx.context`` at plan
        compile time: writes a temp ``.ass`` (registered on ``ctx.owned_files``
        for the runner to delete after streaming) and emits one ``-vf`` entry.
        A missing transcription raises the op's clear context error here --
        before any decode.
        """
        transcription = ctx.context.get("transcription")
        # A missing key and a value of the wrong type are both rejected here.
        if not isinstance(transcription, Transcription):
            raise ValueError(_MISSING_CONTEXT_ERROR)
        ass_path = self._write_ass(self._compile_ass(transcription, ctx.width, ctx.height))
        # Register the temp file before building the filter string so it is
        # tracked for cleanup even if the next step raises.
        ctx.owned_files.append(ass_path)
        return self._filter_expr(ass_path)

to_ffmpeg_filter

to_ffmpeg_filter(ctx: FilterCtx) -> str | None

Compile to a libass subtitles= filter entry.

Consumes the segment-local transcription from ctx.context at plan compile time: writes a temp .ass (registered on ctx.owned_files for the runner to delete after streaming) and emits one -vf entry. A missing transcription raises the op's clear context error here -- before any decode.

Source code in src/videopython/editing/transcription_overlay.py

def to_ffmpeg_filter(self, ctx: FilterCtx) -> str | None:
    """Produce the libass ``subtitles=`` filter entry for this op.

    Reads the segment-local transcription from ``ctx.context`` at plan compile
    time, compiles it to an ASS document sized to ``ctx.width`` x
    ``ctx.height``, writes that to a temp ``.ass`` file, records the file on
    ``ctx.owned_files`` (for the runner to delete after streaming) and
    returns one ``-vf`` entry pointing at it.

    Raises:
        ValueError: If ``ctx.context`` holds no ``Transcription`` under
            ``"transcription"`` -- raised here, before any decode.
    """
    supplied = ctx.context.get("transcription")
    if isinstance(supplied, Transcription):
        document = self._compile_ass(supplied, ctx.width, ctx.height)
        ass_file = self._write_ass(document)
        ctx.owned_files.append(ass_file)
        return self._filter_expr(ass_file)
    raise ValueError(_MISSING_CONTEXT_ERROR)

AnchorPoint

Bases: str, Enum

Which point of the subtitle box sits at the configured position.

Source code in src/videopython/editing/_ass.py

class AnchorPoint(str, Enum):
    """Which point of the subtitle box sits at the configured position."""

    # The ``str`` mixin makes every member a string equal to its value.
    # Values read "<vertical>-<horizontal>"; the middle of the box is plain "center".
    TOP_LEFT = "top-left"
    TOP_CENTER = "top-center"
    TOP_RIGHT = "top-right"
    CENTER_LEFT = "center-left"
    CENTER = "center"
    CENTER_RIGHT = "center-right"
    BOTTOM_LEFT = "bottom-left"
    BOTTOM_CENTER = "bottom-center"
    BOTTOM_RIGHT = "bottom-right"

Text & Transcription

Transcription Classes

Transcription

Transcription

words property

__init__

speaker_stats

offset

standardize_segments

capitalize_sentences

chunk_segments

slice

to_srt

from_srt classmethod

save_srt

to_dict

from_dict classmethod

TranscriptionSegment

TranscriptionSegment dataclass

to_dict

from_dict classmethod

from_words classmethod

TranscriptionWord

TranscriptionWord dataclass

to_dict

from_dict classmethod

Overlay Classes

TranscriptionOverlay

TranscriptionOverlay

to_ffmpeg_filter

AnchorPoint

AnchorPoint

words `property`

__init__

from_srt `classmethod`

from_dict `classmethod`

TranscriptionSegment `dataclass`

from_dict `classmethod`

from_words `classmethod`

TranscriptionWord `dataclass`

from_dict `classmethod`