Skip to content

AI Generation

Generate videos, images, audio, and music from text prompts.

Local Model Support

| Class | Local Model Family |
| --- | --- |
| TextToVideo | CogVideoX1.5-5B |
| ImageToVideo | CogVideoX1.5-5B-I2V |
| TextToSpeech | Chatterbox Multilingual |
| TextToMusic | MusicGen |
| TextToImage | SDXL |

TextToVideo

TextToVideo

Generates videos from text descriptions using local diffusion models.

Source code in src/videopython/ai/generation/video.py
class TextToVideo:
    """Text-to-video generator backed by a local CogVideoX diffusion pipeline.

    The pipeline is not loaded at construction time; it is created on the
    first ``generate_video()`` call and can be dropped with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        # Construction is cheap: no model is loaded until it is needed.
        self._pipeline: Any = None
        self.device = device

    def _init_local(self) -> None:
        """Load the CogVideoX1.5-5B pipeline and move it to the resolved device."""
        from diffusers import CogVideoXPipeline

        asked_for = self.device
        resolved, torch_dtype = _get_torch_device_and_dtype(asked_for)

        self._pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX1.5-5B", torch_dtype=torch_dtype)
        self._pipeline.to(resolved)

        # From here on ``self.device`` holds the resolved device, not the request.
        self.device = resolved
        log_device_initialization("TextToVideo", requested_device=asked_for, resolved_device=resolved)

    def generate_video(
        self,
        prompt: str,
        num_steps: int = 50,
        num_frames: int = 81,
        guidance_scale: float = 6.0,
    ) -> Video:
        """Render a video clip for ``prompt``.

        Args:
            prompt: Text description of the video.
            num_steps: Forwarded to the pipeline as ``num_inference_steps``.
            num_frames: Forwarded to the pipeline as ``num_frames``.
            guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

        Returns:
            A ``Video`` built from the generated frames at 16 fps.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        # The generator is always seeded with 42.
        rng = torch.Generator(device=self.device).manual_seed(42)
        result = self._pipeline(
            prompt=prompt,
            num_inference_steps=num_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=rng,
        )
        frames = np.asarray(result.frames[0], dtype=np.uint8)
        return Video.from_frames(frames, fps=16.0)

    def unload(self) -> None:
        """Drop the pipeline reference; the next generate_video() loads it again."""
        device = self.device
        self._pipeline = None
        release_device_memory(device)

generate_video

generate_video(
    prompt: str,
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video

Generate video from text prompt.

Source code in src/videopython/ai/generation/video.py
def generate_video(
    self,
    prompt: str,
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video:
    """Render a video clip for ``prompt``.

    Args:
        prompt: Text description of the video.
        num_steps: Forwarded to the pipeline as ``num_inference_steps``.
        num_frames: Forwarded to the pipeline as ``num_frames``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

    Returns:
        A ``Video`` built from the generated frames at 16 fps.
    """
    import torch

    # Load the pipeline on first use.
    if self._pipeline is None:
        self._init_local()

    # The generator is always seeded with 42.
    rng = torch.Generator(device=self.device).manual_seed(42)
    result = self._pipeline(
        prompt=prompt,
        num_inference_steps=num_steps,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        generator=rng,
    )
    frames = np.asarray(result.frames[0], dtype=np.uint8)
    return Video.from_frames(frames, fps=16.0)

unload

unload() -> None

Release the diffusion pipeline so the next generate_video() re-initializes.

Source code in src/videopython/ai/generation/video.py
def unload(self) -> None:
    """Drop the pipeline reference; the next generate_video() loads it again."""
    device = self.device
    self._pipeline = None
    release_device_memory(device)

ImageToVideo

ImageToVideo

Generates videos from static images using local video diffusion.

Source code in src/videopython/ai/generation/video.py
class ImageToVideo:
    """Image-to-video generator backed by a local CogVideoX image-to-video pipeline.

    The pipeline is not loaded at construction time; it is created on the
    first ``generate_video()`` call and can be dropped with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        # Construction is cheap: no model is loaded until it is needed.
        self._pipeline: Any = None
        self.device = device

    def _init_local(self) -> None:
        """Load the CogVideoX1.5-5B-I2V pipeline and move it to the resolved device."""
        from diffusers import CogVideoXImageToVideoPipeline

        asked_for = self.device
        resolved, torch_dtype = _get_torch_device_and_dtype(asked_for)

        self._pipeline = CogVideoXImageToVideoPipeline.from_pretrained(
            "THUDM/CogVideoX1.5-5B-I2V",
            torch_dtype=torch_dtype,
        )
        self._pipeline.to(resolved)

        # From here on ``self.device`` holds the resolved device, not the request.
        self.device = resolved
        log_device_initialization("ImageToVideo", requested_device=asked_for, resolved_device=resolved)

    def generate_video(
        self,
        image: Image,
        prompt: str = "",
        num_steps: int = 50,
        num_frames: int = 81,
        guidance_scale: float = 6.0,
    ) -> Video:
        """Animate a still image into a video clip.

        Args:
            image: Source image, forwarded to the pipeline as ``image``.
            prompt: Optional text prompt; defaults to the empty string.
            num_steps: Forwarded to the pipeline as ``num_inference_steps``.
            num_frames: Forwarded to the pipeline as ``num_frames``.
            guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

        Returns:
            A ``Video`` built from the generated frames at 16 fps.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        # The generator is always seeded with 42.
        rng = torch.Generator(device=self.device).manual_seed(42)
        result = self._pipeline(
            prompt=prompt,
            image=image,
            num_inference_steps=num_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=rng,
        )
        frames = np.asarray(result.frames[0], dtype=np.uint8)
        return Video.from_frames(frames, fps=16.0)

    def unload(self) -> None:
        """Drop the pipeline reference; the next generate_video() loads it again."""
        device = self.device
        self._pipeline = None
        release_device_memory(device)

generate_video

generate_video(
    image: Image,
    prompt: str = "",
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video

Generate video animation from a static image.

Source code in src/videopython/ai/generation/video.py
def generate_video(
    self,
    image: Image,
    prompt: str = "",
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video:
    """Animate a still image into a video clip.

    Args:
        image: Source image, forwarded to the pipeline as ``image``.
        prompt: Optional text prompt; defaults to the empty string.
        num_steps: Forwarded to the pipeline as ``num_inference_steps``.
        num_frames: Forwarded to the pipeline as ``num_frames``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

    Returns:
        A ``Video`` built from the generated frames at 16 fps.
    """
    import torch

    # Load the pipeline on first use.
    if self._pipeline is None:
        self._init_local()

    # The generator is always seeded with 42.
    rng = torch.Generator(device=self.device).manual_seed(42)
    result = self._pipeline(
        prompt=prompt,
        image=image,
        num_inference_steps=num_steps,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        generator=rng,
    )
    frames = np.asarray(result.frames[0], dtype=np.uint8)
    return Video.from_frames(frames, fps=16.0)

unload

unload() -> None

Release the diffusion pipeline so the next generate_video() re-initializes.

Source code in src/videopython/ai/generation/video.py
def unload(self) -> None:
    """Drop the pipeline reference; the next generate_video() loads it again."""
    device = self.device
    self._pipeline = None
    release_device_memory(device)

TextToImage

TextToImage

Generates images from text descriptions using local models.

Source code in src/videopython/ai/generation/image.py
class TextToImage:
    """Text-to-image generator backed by a local SDXL diffusion pipeline.

    The pipeline is not loaded at construction time; it is created on the
    first ``generate_image()`` call and can be dropped with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        # Construction is cheap: no model is loaded until it is needed.
        self._pipeline: Any = None
        self.device = device

    def _init_local(self) -> None:
        """Load the SDXL base pipeline and move it to the resolved device."""
        import torch
        from diffusers import DiffusionPipeline

        asked_for = self.device
        resolved = select_device(asked_for, mps_allowed=True)

        # Half precision and the fp16 weight variant are used on CUDA only.
        if resolved == "cuda":
            torch_dtype, weights_variant = torch.float16, "fp16"
        else:
            torch_dtype, weights_variant = torch.float32, None

        self._pipeline = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch_dtype,
            variant=weights_variant,
            use_safetensors=True,
        )
        self._pipeline.to(resolved)

        # From here on ``self.device`` holds the resolved device, not the request.
        self.device = resolved
        log_device_initialization("TextToImage", requested_device=asked_for, resolved_device=resolved)

        # Attention slicing is switched on for MPS only.
        if resolved == "mps":
            self._pipeline.enable_attention_slicing()

    def generate_image(self, prompt: str) -> Image.Image:
        """Return the first image the pipeline produces for ``prompt``."""
        pipeline = self._pipeline
        if pipeline is None:
            self._init_local()
            pipeline = self._pipeline
        result = pipeline(prompt=prompt)
        return result.images[0]

    def unload(self) -> None:
        """Drop the pipeline reference; the next generate_image() loads it again."""
        device = self.device
        self._pipeline = None
        release_device_memory(device)

generate_image

generate_image(prompt: str) -> Image.Image

Generate an image from a text prompt.

Source code in src/videopython/ai/generation/image.py
def generate_image(self, prompt: str) -> Image.Image:
    """Return the first image the pipeline produces for ``prompt``."""
    pipeline = self._pipeline
    if pipeline is None:
        # Load the pipeline on first use, then pick up the fresh reference.
        self._init_local()
        pipeline = self._pipeline
    result = pipeline(prompt=prompt)
    return result.images[0]

unload

unload() -> None

Release the diffusion pipeline so the next generate_image() re-initializes.

Source code in src/videopython/ai/generation/image.py
def unload(self) -> None:
    """Drop the pipeline reference; the next generate_image() loads it again."""
    device = self.device
    self._pipeline = None
    release_device_memory(device)

TextToSpeech

generate_audio accepts three optional knobs that are forwarded to Chatterbox's generate() — exaggeration, cfg_weight, and temperature — for callers who want to shape per-utterance prosody. Each defaults to None, which means "do not pass the kwarg; let Chatterbox use its own default". The dubbing pipeline derives them per segment from the RMS level of the source vocals, via the Expressiveness dataclass.

from videopython.ai import TextToSpeech

tts = TextToSpeech()

# No knobs passed: Chatterbox uses its own defaults.
audio = tts.generate_audio("Welcome to videopython.")

# Dramatic delivery: exaggeration above its 0.5 default gives more emotional
# intensity, and cfg_weight below its 0.5 default slows the pacing.
dramatic = tts.generate_audio(
    "We made it.",
    exaggeration=0.85,
    cfg_weight=0.35,
)

TextToSpeech

Generates speech audio from text using Chatterbox Multilingual.

Backed by Chatterbox Multilingual (Resemble AI). When voice_sample is provided to generate_audio, the model clones that voice; otherwise it falls back to Chatterbox's built-in default speaker.

Source code in src/videopython/ai/generation/audio.py
class TextToSpeech:
    """Generates speech audio from text using Chatterbox Multilingual.

    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
    provided to ``generate_audio``, the model clones that voice; otherwise it
    falls back to Chatterbox's built-in default speaker.

    The model is loaded lazily on the first ``generate_audio()`` call and can
    be dropped again with ``unload()``.
    """

    # Sample rate written into the metadata of every generated ``Audio``.
    SAMPLE_RATE: int = 24000

    def __init__(
        self,
        voice: Audio | None = None,
        device: str | None = None,
        language: str = "en",
    ):
        """Store configuration; no model is loaded here.

        Args:
            voice: Default voice sample used when ``generate_audio`` gets none.
            device: Requested device, or ``None`` to let ``select_device`` choose.
            language: Passed to Chatterbox as ``language_id``.
        """
        self.voice = voice
        self.device = device
        self.language = language
        self._model: Any = None

    def _init_local(self) -> None:
        """Load the Chatterbox model on the resolved device (MPS is not allowed)."""
        from chatterbox.mtl_tts import ChatterboxMultilingualTTS

        requested_device = self.device
        device = select_device(self.device, mps_allowed=False)

        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
        # From here on ``self.device`` holds the resolved device, not the request.
        self.device = device
        log_device_initialization(
            "TextToSpeech",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_audio(
        self,
        text: str,
        voice_sample: Audio | None = None,
        voice_sample_path: str | Path | None = None,
        exaggeration: float | None = None,
        cfg_weight: float | None = None,
        temperature: float | None = None,
    ) -> Audio:
        """Generate speech audio from text.

        Args:
            text: Text to synthesize.
            voice_sample: Optional voice sample to clone. Falls back to the
                instance's ``voice`` and then to Chatterbox's default speaker.
            voice_sample_path: Optional pre-encoded WAV path to use directly as
                the speaker prompt. Skips the per-call temp-WAV encode that
                ``voice_sample`` would otherwise trigger. When set, takes
                precedence over ``voice_sample`` and ``self.voice``. Used by
                the dubbing pipeline to encode each speaker's sample once and
                reuse it across all of that speaker's segments.
            exaggeration: Chatterbox emotional-intensity knob (default
                ``0.5``). ``None`` (default) means do not pass the kwarg —
                Chatterbox uses its own default and we stay forward-compatible
                with changes to it. ``0.7+`` produces dramatic output.
            cfg_weight: Chatterbox classifier-free-guidance weight (default
                ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
                slow pacing.
            temperature: Chatterbox sampling temperature (default ``0.8``).
                ``None`` means do not pass.

        Returns:
            Mono ``Audio`` whose metadata uses ``SAMPLE_RATE``.
        """
        import tempfile
        from pathlib import Path

        import numpy as np

        if self._model is None:
            self._init_local()

        speaker_wav_path: Path | None = None
        cleanup_path = False

        # The try starts before the temp file is created so that a failure
        # while encoding the sample (or anywhere later) still removes it.
        try:
            if voice_sample_path is not None:
                # Caller-owned file: use it as-is and never delete it.
                speaker_wav_path = Path(voice_sample_path)
            else:
                effective_sample = voice_sample or self.voice
                if effective_sample is not None:
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                        # Register the file for cleanup BEFORE writing to it;
                        # otherwise a failing save() would leak the temp file.
                        speaker_wav_path = Path(f.name)
                        cleanup_path = True
                        effective_sample.save(f.name)

            # Only forward knobs the caller explicitly set. Passing nothing
            # for a knob lets Chatterbox use its own default — important so a
            # future Chatterbox default change doesn't get pinned by us.
            knobs: dict[str, float] = {}
            if exaggeration is not None:
                knobs["exaggeration"] = exaggeration
            if cfg_weight is not None:
                knobs["cfg_weight"] = cfg_weight
            if temperature is not None:
                knobs["temperature"] = temperature

            wav = self._model.generate(
                text=text,
                language_id=self.language,
                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
                **knobs,
            )

            audio_data = wav.cpu().float().numpy().squeeze()
            # squeeze() collapses a single-sample result to 0-d; restore a 1-d
            # array so len() below works.
            if audio_data.ndim == 0:
                audio_data = np.array([audio_data], dtype=np.float32)

            metadata = AudioMetadata(
                sample_rate=self.SAMPLE_RATE,
                channels=1,
                sample_width=2,
                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
                frame_count=len(audio_data),
            )
            return Audio(audio_data, metadata)
        finally:
            # Remove only the temp file this call created.
            if cleanup_path and speaker_wav_path is not None:
                speaker_wav_path.unlink(missing_ok=True)

    def unload(self) -> None:
        """Release the TTS model so the next generate_audio() re-initializes.

        Used by low-memory dubbing to free VRAM between pipeline stages.
        """
        self._model = None
        release_device_memory(self.device)

generate_audio

generate_audio(
    text: str,
    voice_sample: Audio | None = None,
    voice_sample_path: str | Path | None = None,
    exaggeration: float | None = None,
    cfg_weight: float | None = None,
    temperature: float | None = None,
) -> Audio

Generate speech audio from text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Text to synthesize. | required |
| voice_sample | Audio \| None | Optional voice sample to clone. Falls back to the instance's voice and then to Chatterbox's default speaker. | None |
| voice_sample_path | str \| Path \| None | Optional pre-encoded WAV path to use directly as the speaker prompt. Skips the per-call temp-WAV encode that voice_sample would otherwise trigger. When set, takes precedence over voice_sample and self.voice. Used by the dubbing pipeline to encode each speaker's sample once and reuse it across all of that speaker's segments. | None |
| exaggeration | float \| None | Chatterbox emotional-intensity knob (default 0.5). None (default) means do not pass the kwarg — Chatterbox uses its own default and we stay forward-compatible with changes to it. 0.7+ produces dramatic output. | None |
| cfg_weight | float \| None | Chatterbox classifier-free-guidance weight (default 0.5). None means do not pass. Lower values (~0.3) slow pacing. | None |
| temperature | float \| None | Chatterbox sampling temperature (default 0.8). None means do not pass. | None |

Source code in src/videopython/ai/generation/audio.py
def generate_audio(
    self,
    text: str,
    voice_sample: Audio | None = None,
    voice_sample_path: str | Path | None = None,
    exaggeration: float | None = None,
    cfg_weight: float | None = None,
    temperature: float | None = None,
) -> Audio:
    """Generate speech audio from text.

    Args:
        text: Text to synthesize.
        voice_sample: Optional voice sample to clone. Falls back to the
            instance's ``voice`` and then to Chatterbox's default speaker.
        voice_sample_path: Optional pre-encoded WAV path to use directly as
            the speaker prompt. Skips the per-call temp-WAV encode that
            ``voice_sample`` would otherwise trigger. When set, takes
            precedence over ``voice_sample`` and ``self.voice``. Used by
            the dubbing pipeline to encode each speaker's sample once and
            reuse it across all of that speaker's segments.
        exaggeration: Chatterbox emotional-intensity knob (default
            ``0.5``). ``None`` (default) means do not pass the kwarg —
            Chatterbox uses its own default and we stay forward-compatible
            with changes to it. ``0.7+`` produces dramatic output.
        cfg_weight: Chatterbox classifier-free-guidance weight (default
            ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
            slow pacing.
        temperature: Chatterbox sampling temperature (default ``0.8``).
            ``None`` means do not pass.

    Returns:
        Mono ``Audio`` whose metadata uses ``SAMPLE_RATE``.
    """
    import tempfile
    from pathlib import Path

    import numpy as np

    if self._model is None:
        self._init_local()

    speaker_wav_path: Path | None = None
    cleanup_path = False

    # The try starts before the temp file is created so that a failure
    # while encoding the sample (or anywhere later) still removes it.
    try:
        if voice_sample_path is not None:
            # Caller-owned file: use it as-is and never delete it.
            speaker_wav_path = Path(voice_sample_path)
        else:
            effective_sample = voice_sample or self.voice
            if effective_sample is not None:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    # Register the file for cleanup BEFORE writing to it;
                    # otherwise a failing save() would leak the temp file.
                    speaker_wav_path = Path(f.name)
                    cleanup_path = True
                    effective_sample.save(f.name)

        # Only forward knobs the caller explicitly set. Passing nothing
        # for a knob lets Chatterbox use its own default — important so a
        # future Chatterbox default change doesn't get pinned by us.
        knobs: dict[str, float] = {}
        if exaggeration is not None:
            knobs["exaggeration"] = exaggeration
        if cfg_weight is not None:
            knobs["cfg_weight"] = cfg_weight
        if temperature is not None:
            knobs["temperature"] = temperature

        wav = self._model.generate(
            text=text,
            language_id=self.language,
            audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
            **knobs,
        )

        audio_data = wav.cpu().float().numpy().squeeze()
        # squeeze() collapses a single-sample result to 0-d; restore a 1-d
        # array so len() below works.
        if audio_data.ndim == 0:
            audio_data = np.array([audio_data], dtype=np.float32)

        metadata = AudioMetadata(
            sample_rate=self.SAMPLE_RATE,
            channels=1,
            sample_width=2,
            duration_seconds=len(audio_data) / self.SAMPLE_RATE,
            frame_count=len(audio_data),
        )
        return Audio(audio_data, metadata)
    finally:
        # Remove only the temp file this call created.
        if cleanup_path and speaker_wav_path is not None:
            speaker_wav_path.unlink(missing_ok=True)

unload

unload() -> None

Release the TTS model so the next generate_audio() re-initializes.

Used by low-memory dubbing to free VRAM between pipeline stages.

Source code in src/videopython/ai/generation/audio.py
def unload(self) -> None:
    """Drop the TTS model reference; the next generate_audio() loads it again.

    Low-memory dubbing calls this to free VRAM between pipeline stages.
    """
    device = self.device
    self._model = None
    release_device_memory(device)

TextToMusic

TextToMusic

Generates music from text descriptions using MusicGen.

Source code in src/videopython/ai/generation/audio.py
class TextToMusic:
    """Text-to-music generator backed by a locally run MusicGen model.

    The processor and model are not loaded at construction time; they are
    created on the first ``generate_audio()`` call and can be dropped with
    ``unload()``.
    """

    def __init__(self, device: str | None = None):
        # Construction is cheap: nothing is loaded until it is needed.
        self._model: Any = None
        self._processor: Any = None
        self.device = device

    def _init_local(self) -> None:
        """Load the MusicGen processor and model and move the model to the device."""
        import os

        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        # Set process-wide, regardless of which device ends up selected.
        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

        asked_for = self.device
        resolved = select_device(asked_for, mps_allowed=True)

        checkpoint = "facebook/musicgen-small"
        self._processor = AutoProcessor.from_pretrained(checkpoint)
        self._model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        self._model.to(resolved)

        # From here on ``self.device`` holds the resolved device, not the request.
        self.device = resolved
        log_device_initialization("TextToMusic", requested_device=asked_for, resolved_device=resolved)

    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
        """Generate a mono music clip for the description in ``text``.

        Args:
            text: Text description of the music.
            max_new_tokens: Forwarded to the model's ``generate()``.

        Returns:
            ``Audio`` whose sample rate comes from the model's audio-encoder config.
        """
        if self._model is None:
            self._init_local()

        encoded = self._processor(text=[text], padding=True, return_tensors="pt")

        # Move anything that has a ``.to`` method onto the model's device.
        model_inputs = {}
        for key, value in encoded.items():
            model_inputs[key] = value.to(self.device) if hasattr(value, "to") else value

        generated = self._model.generate(**model_inputs, max_new_tokens=max_new_tokens)
        sampling_rate = self._model.config.audio_encoder.sampling_rate

        # Index [0, 0] — presumably first batch item, first channel; TODO confirm layout.
        samples = generated[0, 0].cpu().float().numpy()
        sample_count = len(samples)

        return Audio(
            samples,
            AudioMetadata(
                sample_rate=sampling_rate,
                channels=1,
                sample_width=2,
                duration_seconds=sample_count / sampling_rate,
                frame_count=sample_count,
            ),
        )

    def unload(self) -> None:
        """Drop the model and processor; the next generate_audio() loads them again."""
        device = self.device
        self._model = None
        self._processor = None
        release_device_memory(device)

generate_audio

generate_audio(
    text: str, max_new_tokens: int = 256
) -> Audio

Generate music audio from text description.

Source code in src/videopython/ai/generation/audio.py
def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
    """Generate a mono music clip for the description in ``text``.

    Args:
        text: Text description of the music.
        max_new_tokens: Forwarded to the model's ``generate()``.

    Returns:
        ``Audio`` whose sample rate comes from the model's audio-encoder config.
    """
    # Load the processor and model on first use.
    if self._model is None:
        self._init_local()

    encoded = self._processor(text=[text], padding=True, return_tensors="pt")

    # Move anything that has a ``.to`` method onto the model's device.
    model_inputs = {}
    for key, value in encoded.items():
        model_inputs[key] = value.to(self.device) if hasattr(value, "to") else value

    generated = self._model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    sampling_rate = self._model.config.audio_encoder.sampling_rate

    # Index [0, 0] — presumably first batch item, first channel; TODO confirm layout.
    samples = generated[0, 0].cpu().float().numpy()
    sample_count = len(samples)

    return Audio(
        samples,
        AudioMetadata(
            sample_rate=sampling_rate,
            channels=1,
            sample_width=2,
            duration_seconds=sample_count / sampling_rate,
            frame_count=sample_count,
        ),
    )

unload

unload() -> None

Release the MusicGen model so the next generate_audio() re-initializes.

Source code in src/videopython/ai/generation/audio.py
def unload(self) -> None:
    """Drop the model and processor; the next generate_audio() loads them again."""
    device = self.device
    self._model = None
    self._processor = None
    release_device_memory(device)