Skip to content

AI Dubbing

Dub videos into different languages or replace speech with custom text using voice cloning.

Local Pipeline

Video dubbing runs with a local pipeline combining Whisper, translation models, XTTS, and Demucs.

VideoDubber

Main class for video dubbing and voice revoicing.

Basic Dubbing

Translate speech to another language while preserving the original speaker's voice:

from videopython.ai.dubbing import VideoDubber
from videopython.base import Video

# Load the source video; the dubber creates its pipeline lazily on first use.
video = Video.from_path("video.mp4")
dubber = VideoDubber()

# Dub to Spanish with voice cloning
result = dubber.dub(
    video=video,
    target_lang="es",
    source_lang="en",
    preserve_background=True,  # Keep music and sound effects
    voice_clone=True,          # Clone original speaker's voice
)

# Save dubbed video
# overlay=False replaces the original audio track with the dubbed one.
dubbed_video = video.add_audio(result.dubbed_audio, overlay=False)
dubbed_video.save("dubbed_video.mp4")

# Or use convenience method
dubbed_video = dubber.dub_and_replace(video, target_lang="es")
dubbed_video.save("dubbed_video.mp4")

Voice Revoicing

Replace speech with completely different text using the original speaker's voice:

from videopython.ai.dubbing import VideoDubber
from videopython.base import Video

video = Video.from_path("video.mp4")
dubber = VideoDubber()

# Make the person say something different
result = dubber.revoice(
    video=video,
    text="Hello everyone! This is a completely different message.",
    preserve_background=True,
)

# Compare the original footage length to the newly generated speech.
print(f"Original duration: {result.original_duration:.1f}s")
print(f"New speech duration: {result.speech_duration:.1f}s")

# Save revoiced video (trimmed to speech length)
revoiced_video = dubber.revoice_and_replace(
    video=video,
    text="Hello everyone! This is a completely different message.",
)
revoiced_video.save("revoiced_video.mp4")

Progress Tracking

def on_progress(stage: str, progress: float) -> None:
    """Print the current pipeline stage and its progress fraction as a percentage."""
    print(f"[{progress*100:5.1f}%] {stage}")

result = dubber.dub(
    video=video,
    target_lang="es",
    progress_callback=on_progress,
)

VideoDubber

Dubs videos into different languages using the local pipeline.

Source code in src/videopython/ai/dubbing/dubber.py
class VideoDubber:
    """Dubs videos into different languages using the local pipeline.

    The pipeline object is created lazily on first use, so constructing a
    VideoDubber stays cheap until a dub/revoice call is made.
    """

    def __init__(self, device: str | None = None):
        # device is forwarded to the pipeline; None lets the pipeline choose.
        self.device = device
        self._local_pipeline: Any = None

    def _init_local_pipeline(self) -> None:
        # Function-scope import keeps the heavy pipeline dependency chain
        # out of module import time.
        from videopython.ai.dubbing.pipeline import LocalDubbingPipeline

        self._local_pipeline = LocalDubbingPipeline(device=self.device)

    def dub(
        self,
        video: Video,
        target_lang: str,
        source_lang: str | None = None,
        preserve_background: bool = True,
        voice_clone: bool = True,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> DubbingResult:
        """Dub a video into a target language.

        Args:
            video: Source video whose speech should be dubbed.
            target_lang: Language code to dub into (e.g. "es").
            source_lang: Language code of the original speech, if known.
            preserve_background: Keep non-speech audio under the dub.
            voice_clone: Clone the original speaker's voice for the dub.
            progress_callback: Optional callable invoked with (stage, progress).

        Returns:
            DubbingResult holding the dubbed audio track and metadata.
        """
        if self._local_pipeline is None:
            self._init_local_pipeline()
        pipeline = self._local_pipeline
        return pipeline.process(
            video=video,
            target_lang=target_lang,
            source_lang=source_lang,
            preserve_background=preserve_background,
            voice_clone=voice_clone,
            progress_callback=progress_callback,
        )

    def dub_and_replace(
        self,
        video: Video,
        target_lang: str,
        source_lang: str | None = None,
        preserve_background: bool = True,
        voice_clone: bool = True,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> Video:
        """Dub a video and return a new video carrying the dubbed audio."""
        dubbed = self.dub(
            video=video,
            target_lang=target_lang,
            source_lang=source_lang,
            preserve_background=preserve_background,
            voice_clone=voice_clone,
            progress_callback=progress_callback,
        )
        # overlay=False swaps out the original audio instead of mixing over it.
        return video.add_audio(dubbed.dubbed_audio, overlay=False)

    def revoice(
        self,
        video: Video,
        text: str,
        preserve_background: bool = True,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> RevoiceResult:
        """Replace speech in a video with new text using voice cloning.

        Args:
            video: Source video whose speech is replaced.
            text: The new text to be spoken.
            preserve_background: Keep non-speech audio under the new speech.
            progress_callback: Optional callable invoked with (stage, progress).

        Returns:
            RevoiceResult with the new audio track and duration metadata.
        """
        if self._local_pipeline is None:
            self._init_local_pipeline()
        pipeline = self._local_pipeline
        return pipeline.revoice(
            video=video,
            text=text,
            preserve_background=preserve_background,
            progress_callback=progress_callback,
        )

    def revoice_and_replace(
        self,
        video: Video,
        text: str,
        preserve_background: bool = True,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> Video:
        """Revoice a video and return a new video with the revoiced audio.

        The footage is trimmed when the generated speech is shorter than
        the original video.
        """
        outcome = self.revoice(
            video=video,
            text=text,
            preserve_background=preserve_background,
            progress_callback=progress_callback,
        )
        new_length = outcome.speech_duration
        trimmed = video.cut(0, new_length) if video.total_seconds > new_length else video
        return trimmed.add_audio(outcome.revoiced_audio, overlay=False)

    @staticmethod
    def get_supported_languages() -> dict[str, str]:
        """Return a mapping of supported language codes to language names."""
        # Local import mirrors the lazy-loading style used elsewhere here.
        from videopython.ai.generation.translation import TextTranslator

        return TextTranslator.get_supported_languages()

dub

dub(
    video: Video,
    target_lang: str,
    source_lang: str | None = None,
    preserve_background: bool = True,
    voice_clone: bool = True,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> DubbingResult

Dub a video into a target language.

Source code in src/videopython/ai/dubbing/dubber.py
def dub(
    self,
    video: Video,
    target_lang: str,
    source_lang: str | None = None,
    preserve_background: bool = True,
    voice_clone: bool = True,
    progress_callback: Callable[[str, float], None] | None = None,
) -> DubbingResult:
    """Dub a video into a target language.

    Args:
        video: Source video whose speech should be dubbed.
        target_lang: Language code to dub into (e.g. "es").
        source_lang: Language code of the original speech, if known.
        preserve_background: Keep non-speech audio under the dub.
        voice_clone: Clone the original speaker's voice for the dub.
        progress_callback: Optional callable invoked with (stage, progress).

    Returns:
        DubbingResult holding the dubbed audio track and metadata.
    """
    if self._local_pipeline is None:
        self._init_local_pipeline()
    # Delegate the heavy lifting to the lazily created local pipeline.
    pipeline = self._local_pipeline
    return pipeline.process(
        video=video,
        target_lang=target_lang,
        source_lang=source_lang,
        preserve_background=preserve_background,
        voice_clone=voice_clone,
        progress_callback=progress_callback,
    )

dub_and_replace

dub_and_replace(
    video: Video,
    target_lang: str,
    source_lang: str | None = None,
    preserve_background: bool = True,
    voice_clone: bool = True,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> Video

Dub a video and return a new video with the dubbed audio.

Source code in src/videopython/ai/dubbing/dubber.py
def dub_and_replace(
    self,
    video: Video,
    target_lang: str,
    source_lang: str | None = None,
    preserve_background: bool = True,
    voice_clone: bool = True,
    progress_callback: Callable[[str, float], None] | None = None,
) -> Video:
    """Dub a video and return a new video carrying the dubbed audio."""
    dubbed = self.dub(
        video=video,
        target_lang=target_lang,
        source_lang=source_lang,
        preserve_background=preserve_background,
        voice_clone=voice_clone,
        progress_callback=progress_callback,
    )
    # overlay=False swaps out the original audio instead of mixing over it.
    return video.add_audio(dubbed.dubbed_audio, overlay=False)

revoice

revoice(
    video: Video,
    text: str,
    preserve_background: bool = True,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> RevoiceResult

Replace speech in a video with new text using voice cloning.

Source code in src/videopython/ai/dubbing/dubber.py
def revoice(
    self,
    video: Video,
    text: str,
    preserve_background: bool = True,
    progress_callback: Callable[[str, float], None] | None = None,
) -> RevoiceResult:
    """Replace speech in a video with new text using voice cloning.

    Args:
        video: Source video whose speech is replaced.
        text: The new text to be spoken.
        preserve_background: Keep non-speech audio under the new speech.
        progress_callback: Optional callable invoked with (stage, progress).

    Returns:
        RevoiceResult with the new audio track and duration metadata.
    """
    if self._local_pipeline is None:
        self._init_local_pipeline()
    # Delegate to the lazily created local pipeline.
    pipeline = self._local_pipeline
    return pipeline.revoice(
        video=video,
        text=text,
        preserve_background=preserve_background,
        progress_callback=progress_callback,
    )

revoice_and_replace

revoice_and_replace(
    video: Video,
    text: str,
    preserve_background: bool = True,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> Video

Revoice a video and return a new video with the revoiced audio.

Source code in src/videopython/ai/dubbing/dubber.py
def revoice_and_replace(
    self,
    video: Video,
    text: str,
    preserve_background: bool = True,
    progress_callback: Callable[[str, float], None] | None = None,
) -> Video:
    """Revoice a video and return a new video with the revoiced audio.

    The footage is trimmed when the generated speech is shorter than the
    original video.
    """
    outcome = self.revoice(
        video=video,
        text=text,
        preserve_background=preserve_background,
        progress_callback=progress_callback,
    )
    # Trim only when the new speech ends before the video does.
    new_length = outcome.speech_duration
    trimmed = video.cut(0, new_length) if video.total_seconds > new_length else video
    return trimmed.add_audio(outcome.revoiced_audio, overlay=False)

DubbingResult

Result of a dubbing operation containing the dubbed audio and metadata.

result = dubber.dub(video, target_lang="es")

# Summary of what was translated.
print(f"Translated {result.num_segments} segments")
print(f"Source language: {result.source_lang}")
print(f"Target language: {result.target_lang}")

# Access translated segments
for segment in result.translated_segments:
    print(f"'{segment.original_text}' -> '{segment.translated_text}'")

# Access voice samples used for cloning
for speaker, sample in result.voice_samples.items():
    print(f"{speaker}: {sample.metadata.duration_seconds:.1f}s sample")

DubbingResult dataclass

Result of a video dubbing operation.

Attributes:

Name Type Description
dubbed_audio Audio

The final dubbed audio track.

translated_segments list[TranslatedSegment]

List of translated segments with timing.

source_transcription Transcription

Original transcription of the source audio.

source_lang str

Detected or specified source language.

target_lang str

Target language for dubbing.

separated_audio SeparatedAudio | None

Separated audio components (if preserve_background=True).

voice_samples dict[str, Audio]

Dictionary mapping speaker IDs to voice sample Audio.

Source code in src/videopython/ai/dubbing/models.py
@dataclass
class DubbingResult:
    """Result of a video dubbing operation.

    Attributes:
        dubbed_audio: The final dubbed audio track.
        translated_segments: List of translated segments with timing.
        source_transcription: Original transcription of the source audio.
        source_lang: Detected or specified source language.
        target_lang: Target language for dubbing.
        separated_audio: Separated audio components (if preserve_background=True).
        voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
    """

    dubbed_audio: Audio
    translated_segments: list[TranslatedSegment]
    source_transcription: Transcription
    source_lang: str
    target_lang: str
    separated_audio: SeparatedAudio | None = None
    voice_samples: dict[str, Audio] = field(default_factory=dict)

    @property
    def num_segments(self) -> int:
        """Number of translated segments."""
        return len(self.translated_segments)

    @property
    def total_duration(self) -> float:
        """Total duration of the dubbed audio, in seconds."""
        audio_metadata = self.dubbed_audio.metadata
        return audio_metadata.duration_seconds

    def get_segments_by_speaker(self) -> dict[str, list[TranslatedSegment]]:
        """Group translated segments by speaker.

        Returns:
            Dictionary mapping speaker IDs to their segments.
        """
        grouped: dict[str, list[TranslatedSegment]] = {}
        for seg in self.translated_segments:
            # Unlabeled segments are bucketed under "unknown".
            grouped.setdefault(seg.speaker or "unknown", []).append(seg)
        return grouped

num_segments property

num_segments: int

Number of translated segments.

total_duration property

total_duration: float

Total duration of the dubbed audio.

get_segments_by_speaker

get_segments_by_speaker() -> dict[
    str, list[TranslatedSegment]
]

Group translated segments by speaker.

Returns:

Type Description
dict[str, list[TranslatedSegment]]

Dictionary mapping speaker IDs to their segments.

Source code in src/videopython/ai/dubbing/models.py
def get_segments_by_speaker(self) -> dict[str, list[TranslatedSegment]]:
    """Group translated segments by speaker.

    Returns:
        Dictionary mapping speaker IDs to their segments.
    """
    grouped: dict[str, list[TranslatedSegment]] = {}
    for seg in self.translated_segments:
        # Unlabeled segments are bucketed under "unknown".
        grouped.setdefault(seg.speaker or "unknown", []).append(seg)
    return grouped

RevoiceResult

Result of a revoicing operation.

result = dubber.revoice(video, text="New message here")

# Inspect the outcome of the revoicing run.
print(f"Text: {result.text}")
print(f"Speech duration: {result.speech_duration:.1f}s")
print(f"Voice sample: {result.voice_sample.metadata.duration_seconds:.1f}s")

RevoiceResult dataclass

Result of a voice replacement operation.

Attributes:

Name Type Description
revoiced_audio Audio

The final audio with new speech.

text str

The text that was spoken.

separated_audio SeparatedAudio | None

Separated audio components (if preserve_background=True).

voice_sample Audio | None

Voice sample used for cloning.

original_duration float

Duration of the original audio.

speech_duration float

Duration of the generated speech.

Source code in src/videopython/ai/dubbing/models.py
@dataclass
class RevoiceResult:
    """Result of a voice replacement operation.

    Attributes:
        revoiced_audio: The final audio with new speech.
        text: The text that was spoken.
        separated_audio: Separated audio components (if preserve_background=True).
        voice_sample: Voice sample used for cloning.
        original_duration: Duration of the original audio, in seconds.
        speech_duration: Duration of the generated speech, in seconds.
    """

    revoiced_audio: Audio
    text: str
    separated_audio: SeparatedAudio | None = None
    voice_sample: Audio | None = None
    original_duration: float = 0.0
    speech_duration: float = 0.0

    @property
    def total_duration(self) -> float:
        """Total duration of the revoiced audio, in seconds."""
        audio_metadata = self.revoiced_audio.metadata
        return audio_metadata.duration_seconds

total_duration property

total_duration: float

Total duration of the revoiced audio.

TranslatedSegment

Individual translated speech segment with timing information.

TranslatedSegment dataclass

A segment of translated text with timing information.

Attributes:

Name Type Description
original_segment TranscriptionSegment

The original transcription segment.

translated_text str

The translated text.

source_lang str

Source language code (e.g., "en").

target_lang str

Target language code (e.g., "es").

speaker str | None

Speaker identifier if available.

start float

Start time in seconds.

end float

End time in seconds.

Source code in src/videopython/ai/dubbing/models.py
@dataclass
class TranslatedSegment:
    """A segment of translated text with timing information.

    Attributes:
        original_segment: The original transcription segment.
        translated_text: The translated text.
        source_lang: Source language code (e.g., "en").
        target_lang: Target language code (e.g., "es").
        speaker: Speaker identifier if available.
        start: Start time in seconds.
        end: End time in seconds.
    """

    original_segment: TranscriptionSegment
    translated_text: str
    source_lang: str
    target_lang: str
    speaker: str | None = None
    start: float = 0.0
    end: float = 0.0

    def __post_init__(self) -> None:
        """Backfill timing and speaker from the original segment when unset."""
        # Both timestamps at 0.0 means timing was not supplied explicitly.
        timing_missing = self.start == 0.0 and self.end == 0.0
        if timing_missing:
            self.start = self.original_segment.start
            self.end = self.original_segment.end
        if self.speaker is None:
            self.speaker = self.original_segment.speaker

    @property
    def original_text(self) -> str:
        """The untranslated text of the underlying segment."""
        return self.original_segment.text

    @property
    def duration(self) -> float:
        """Duration of the segment in seconds."""
        return self.end - self.start

original_text property

original_text: str

Get the original text from the segment.

duration property

duration: float

Duration of the segment in seconds.

__post_init__

__post_init__() -> None

Set timing from original segment if not provided.

Source code in src/videopython/ai/dubbing/models.py
def __post_init__(self) -> None:
    """Backfill timing and speaker from the original segment when unset."""
    # Both timestamps at 0.0 means timing was not supplied explicitly.
    timing_missing = self.start == 0.0 and self.end == 0.0
    if timing_missing:
        self.start = self.original_segment.start
        self.end = self.original_segment.end
    if self.speaker is None:
        self.speaker = self.original_segment.speaker

SeparatedAudio

Audio separated into vocals and background components.

SeparatedAudio dataclass

Audio separated into different components.

Attributes:

Name Type Description
vocals Audio

Isolated vocal/speech track.

background Audio

Combined background audio (music + effects).

music Audio | None

Isolated music track (if available).

effects Audio | None

Isolated sound effects track (if available).

original Audio

The original unseparated audio.

Source code in src/videopython/ai/dubbing/models.py
@dataclass
class SeparatedAudio:
    """Audio separated into different components.

    Attributes:
        vocals: Isolated vocal/speech track.
        background: Combined background audio (music + effects).
        original: The original unseparated audio.
        music: Isolated music track (if available).
        effects: Isolated sound effects track (if available).
    """

    vocals: Audio
    background: Audio
    original: Audio
    music: Audio | None = None
    effects: Audio | None = None

    @property
    def has_detailed_separation(self) -> bool:
        """Whether both the music and effects stems were separated out."""
        return not (self.music is None or self.effects is None)

has_detailed_separation property

has_detailed_separation: bool

Check if music and effects are separated.

Supported Languages

Get the list of supported languages:

# Mapping of language codes to display names.
languages = VideoDubber.get_supported_languages()
# {'en': 'English', 'es': 'Spanish', 'fr': 'French', ...}

Supported languages include: English, Spanish, French, German, Italian, Portuguese, Polish, Hindi, Arabic, Czech, Danish, Dutch, Finnish, Greek, Hebrew, Indonesian, Japanese, Korean, Malay, Norwegian, Romanian, Russian, Slovak, Swedish, Tamil, Thai, Turkish, Ukrainian, Vietnamese, Chinese.