AI Generation

Generate videos, images, audio, and music from text prompts.

Local Model Support

| Class        | Local Model Family   |
|--------------|----------------------|
| TextToVideo  | CogVideoX1.5-5B      |
| ImageToVideo | CogVideoX1.5-5B-I2V  |
| TextToSpeech | Bark / XTTS pipeline |
| TextToMusic  | MusicGen             |
| TextToImage  | SDXL                 |
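
All generators share the same lifecycle: construction is cheap, the underlying model is downloaded and loaded lazily on the first `generate_*` call, and the optional `device` argument pins inference to a specific backend (otherwise one is selected automatically; note that MPS is disallowed for some models, such as Bark). A minimal sketch of the shared pattern, assuming import paths that mirror the source locations shown in the sections below:

from videopython.ai.generation.image import TextToImage

generator = TextToImage()            # device auto-selected on first use
pinned = TextToImage(device="cpu")   # or pin a backend explicitly

# The SDXL pipeline is loaded here, on the first call, not at construction.
image = generator.generate_image("a watercolor lighthouse at dawn")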

TextToVideo

TextToVideo

Generates videos from text descriptions using local diffusion models.

Source code in src/videopython/ai/generation/video.py
class TextToVideo:
    """Generates videos from text descriptions using local diffusion models."""

    def __init__(self, device: str | None = None):
        self.device = device
        self._pipeline: Any = None
        self._device: str | None = None

    def _init_local(self) -> None:
        from diffusers import CogVideoXPipeline

        self._device, dtype = _get_torch_device_and_dtype(self.device)

        model_name = "THUDM/CogVideoX1.5-5B"
        self._pipeline = CogVideoXPipeline.from_pretrained(model_name, torch_dtype=dtype)
        self._pipeline.to(self._device)
        self.device = self._device

    def generate_video(
        self,
        prompt: str,
        num_steps: int = 50,
        num_frames: int = 81,
        guidance_scale: float = 6.0,
    ) -> Video:
        """Generate video from text prompt."""
        import torch

        if self._pipeline is None:
            self._init_local()

        video_frames = self._pipeline(
            prompt=prompt,
            num_inference_steps=num_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device=self._device).manual_seed(42),
        ).frames[0]
        video_frames = np.asarray(video_frames, dtype=np.uint8)
        return Video.from_frames(video_frames, fps=16.0)

generate_video

generate_video(
    prompt: str,
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video

Generate video from text prompt.

Source code in src/videopython/ai/generation/video.py
def generate_video(
    self,
    prompt: str,
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video:
    """Generate video from text prompt."""
    import torch

    if self._pipeline is None:
        self._init_local()

    video_frames = self._pipeline(
        prompt=prompt,
        num_inference_steps=num_steps,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        generator=torch.Generator(device=self._device).manual_seed(42),
    ).frames[0]
    video_frames = np.asarray(video_frames, dtype=np.uint8)
    return Video.from_frames(video_frames, fps=16.0)
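
A short usage sketch for reference; the import path mirrors the source location above and the prompt is illustrative:

from videopython.ai.generation.video import TextToVideo

generator = TextToVideo()
video = generator.generate_video(
    prompt="A golden retriever running through a sunlit meadow",
    num_steps=50,        # denoising steps: more is slower but usually cleaner
    num_frames=81,       # output is fixed at 16 fps, so 81 frames ≈ 5 seconds
    guidance_scale=6.0,  # how strongly sampling follows the prompt
)

Note that the pipeline is seeded with a fixed `manual_seed(42)`, so repeated calls with identical arguments produce the same video.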

ImageToVideo

ImageToVideo

Generates videos from static images using local video diffusion.

Source code in src/videopython/ai/generation/video.py
class ImageToVideo:
    """Generates videos from static images using local video diffusion."""

    def __init__(self, device: str | None = None):
        self.device = device
        self._pipeline: Any = None
        self._device: str | None = None

    def _init_local(self) -> None:
        from diffusers import CogVideoXImageToVideoPipeline

        self._device, dtype = _get_torch_device_and_dtype(self.device)

        model_name = "THUDM/CogVideoX1.5-5B-I2V"
        self._pipeline = CogVideoXImageToVideoPipeline.from_pretrained(model_name, torch_dtype=dtype)
        self._pipeline.to(self._device)
        self.device = self._device

    def generate_video(
        self,
        image: Image,
        prompt: str = "",
        num_steps: int = 50,
        num_frames: int = 81,
        guidance_scale: float = 6.0,
    ) -> Video:
        """Generate video animation from a static image."""
        import torch

        if self._pipeline is None:
            self._init_local()

        video_frames = self._pipeline(
            prompt=prompt,
            image=image,
            num_inference_steps=num_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device=self._device).manual_seed(42),
        ).frames[0]
        video_frames = np.asarray(video_frames, dtype=np.uint8)
        return Video.from_frames(video_frames, fps=16.0)

generate_video

generate_video(
    image: Image,
    prompt: str = "",
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video

Generate video animation from a static image.

Source code in src/videopython/ai/generation/video.py
def generate_video(
    self,
    image: Image,
    prompt: str = "",
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video:
    """Generate video animation from a static image."""
    import torch

    if self._pipeline is None:
        self._init_local()

    video_frames = self._pipeline(
        prompt=prompt,
        image=image,
        num_inference_steps=num_steps,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        generator=torch.Generator(device=self._device).manual_seed(42),
    ).frames[0]
    video_frames = np.asarray(video_frames, dtype=np.uint8)
    return Video.from_frames(video_frames, fps=16.0)
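
A short usage sketch; it assumes the `image` argument is a PIL image, consistent with the `Image.Image` return type of `TextToImage` below, and the file path is illustrative:

from PIL import Image

from videopython.ai.generation.video import ImageToVideo

animator = ImageToVideo()
still = Image.open("portrait.png")             # the frame to animate
video = animator.generate_video(
    image=still,
    prompt="slow zoom in, soft window light",  # optional guidance text
    num_frames=81,                             # ≈ 5 seconds at the fixed 16 fps output
)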

TextToImage

TextToImage

Generates images from text descriptions using local models.

Source code in src/videopython/ai/generation/image.py
class TextToImage:
    """Generates images from text descriptions using local models."""

    def __init__(self, device: str | None = None):
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Initialize local diffusion pipeline."""
        import torch
        from diffusers import DiffusionPipeline

        device = select_device(self.device, mps_allowed=True)
        dtype = torch.float16 if device == "cuda" else torch.float32
        variant = "fp16" if device == "cuda" else None

        model_name = "stabilityai/stable-diffusion-xl-base-1.0"
        self._pipeline = DiffusionPipeline.from_pretrained(
            model_name,
            torch_dtype=dtype,
            variant=variant,
            use_safetensors=True,
        )
        self._pipeline.to(device)
        self.device = device

        if device == "mps":
            self._pipeline.enable_attention_slicing()

    def generate_image(self, prompt: str) -> Image.Image:
        """Generate an image from a text prompt."""
        if self._pipeline is None:
            self._init_local()
        return self._pipeline(prompt=prompt).images[0]

generate_image

generate_image(prompt: str) -> Image.Image

Generate an image from a text prompt.

Source code in src/videopython/ai/generation/image.py
def generate_image(self, prompt: str) -> Image.Image:
    """Generate an image from a text prompt."""
    if self._pipeline is None:
        self._init_local()
    return self._pipeline(prompt=prompt).images[0]
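
A short usage sketch; the import path mirrors the source location above. The return value is a standard PIL `Image.Image`, so it can be saved directly or passed on to `ImageToVideo`:

from videopython.ai.generation.image import TextToImage

generator = TextToImage()
image = generator.generate_image("an isometric pixel-art city at dusk")
image.save("city.png")  # plain PIL save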

TextToSpeech

TextToSpeech

Generates speech audio from text using local models.

Supports Bark (`base`, `small`) and XTTS voice cloning (`xtts`).

Source code in src/videopython/ai/generation/audio.py
class TextToSpeech:
    """Generates speech audio from text using local models.

    Supports Bark (`base`, `small`) and XTTS voice cloning (`xtts`).
    """

    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "xtts"]

    def __init__(
        self,
        model_size: str = "base",
        voice: str | None = None,
        device: str | None = None,
        language: str = "en",
    ):
        if model_size not in self.SUPPORTED_LOCAL_MODELS:
            raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")

        self.model_size = model_size
        self.voice = voice
        self.device = device
        self.language = language
        self._model: Any = None
        self._processor: Any = None
        self._xtts_model: Any = None

    def _init_local(self) -> None:
        """Initialize local Bark model."""
        from transformers import AutoModel, AutoProcessor

        device = select_device(self.device, mps_allowed=False)

        model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
        self._processor = AutoProcessor.from_pretrained(model_name)
        self._model = AutoModel.from_pretrained(model_name).to(device)
        self.device = device

    def _patch_xtts_load_audio(self) -> None:
        """Patch XTTS load_audio to avoid torchcodec dependency issues."""
        import TTS.tts.models.xtts as xtts_module

        def load_audio_soundfile(audiopath: str, sampling_rate: int):
            import soundfile as sf  # type: ignore[import-untyped]
            import torch
            import torchaudio.functional as F  # type: ignore[import-untyped]

            audio_np, sr = sf.read(audiopath, dtype="float32")

            audio = torch.from_numpy(audio_np)
            if audio.dim() == 1:
                audio = audio.unsqueeze(0)
            else:
                audio = audio.T

            if audio.size(0) != 1:
                audio = torch.mean(audio, dim=0, keepdim=True)

            if sr != sampling_rate:
                audio = F.resample(audio, sr, sampling_rate)

            return audio

        xtts_module.load_audio = load_audio_soundfile

    def _init_xtts(self) -> None:
        """Initialize XTTS-v2 model for voice cloning."""
        from TTS.api import TTS

        self._patch_xtts_load_audio()

        device = select_device(self.device, mps_allowed=False)

        self._xtts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
        self.device = device

    def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
        """Generate speech using Bark."""
        import torch

        if self._model is None:
            self._init_local()

        inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        with torch.no_grad():
            speech_values = self._model.generate(**inputs, do_sample=True)

        audio_data = speech_values.cpu().float().numpy().squeeze()
        sample_rate = self._model.generation_config.sample_rate

        metadata = AudioMetadata(
            sample_rate=sample_rate,
            channels=1,
            sample_width=2,
            duration_seconds=len(audio_data) / sample_rate,
            frame_count=len(audio_data),
        )
        return Audio(audio_data, metadata)

    def _generate_xtts(self, text: str, voice_sample: Audio) -> Audio:
        """Generate speech using XTTS-v2 with voice cloning."""
        import tempfile
        from pathlib import Path

        import numpy as np

        if self._xtts_model is None:
            self._init_xtts()

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            voice_sample.save(f.name)
            speaker_wav_path = Path(f.name)

        try:
            audio_list = self._xtts_model.tts(
                text=text,
                speaker_wav=str(speaker_wav_path),
                language=self.language,
            )
            audio_data = np.array(audio_list, dtype=np.float32)
            sample_rate = 24000

            metadata = AudioMetadata(
                sample_rate=sample_rate,
                channels=1,
                sample_width=2,
                duration_seconds=len(audio_data) / sample_rate,
                frame_count=len(audio_data),
            )
            return Audio(audio_data, metadata)
        finally:
            speaker_wav_path.unlink()

    def generate_audio(
        self,
        text: str,
        voice_preset: str | None = None,
        voice_sample: Audio | None = None,
    ) -> Audio:
        """Generate speech audio from text."""
        effective_voice = voice_preset or self.voice

        if self.model_size == "xtts" or voice_sample is not None:
            if voice_sample is None:
                raise ValueError(
                    "voice_sample is required for XTTS voice cloning. Provide an Audio sample of the voice to clone."
                )
            return self._generate_xtts(text, voice_sample)

        return self._generate_local(text, effective_voice)

generate_audio

generate_audio(
    text: str,
    voice_preset: str | None = None,
    voice_sample: Audio | None = None,
) -> Audio

Generate speech audio from text.

Source code in src/videopython/ai/generation/audio.py
def generate_audio(
    self,
    text: str,
    voice_preset: str | None = None,
    voice_sample: Audio | None = None,
) -> Audio:
    """Generate speech audio from text."""
    effective_voice = voice_preset or self.voice

    if self.model_size == "xtts" or voice_sample is not None:
        if voice_sample is None:
            raise ValueError(
                "voice_sample is required for XTTS voice cloning. Provide an Audio sample of the voice to clone."
            )
        return self._generate_xtts(text, voice_sample)

    return self._generate_local(text, effective_voice)
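
Two usage sketches, one per backend. `v2/en_speaker_6` is one of the stock voice presets shipped with `suno/bark`; `reference_audio` is a hypothetical, already-loaded videopython `Audio` object containing a few seconds of the voice to clone:

from videopython.ai.generation.audio import TextToSpeech

# Bark with a stock voice preset
tts = TextToSpeech(model_size="small")
speech = tts.generate_audio(
    "Welcome back to the channel!",
    voice_preset="v2/en_speaker_6",
)

# XTTS-v2 voice cloning from a reference sample
cloner = TextToSpeech(model_size="xtts", language="en")
cloned = cloner.generate_audio(
    "Welcome back to the channel!",
    voice_sample=reference_audio,  # hypothetical pre-loaded Audio sample
)

Note the routing in `generate_audio`: passing `voice_sample` selects XTTS even when `model_size` is `base` or `small`, while `model_size="xtts"` without a `voice_sample` raises a ValueError.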

TextToMusic

TextToMusic

Generates music from text descriptions using MusicGen.

Source code in src/videopython/ai/generation/audio.py
class TextToMusic:
    """Generates music from text descriptions using MusicGen."""

    def __init__(self, device: str | None = None):
        self.device = device
        self._processor: Any = None
        self._model: Any = None
        self._device: str | None = None

    def _init_local(self) -> None:
        """Initialize local MusicGen model."""
        import os

        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

        self._device = select_device(self.device, mps_allowed=True)

        model_name = "facebook/musicgen-small"
        self._processor = AutoProcessor.from_pretrained(model_name)
        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
        self._model.to(self._device)

    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
        """Generate music audio from text description."""
        if self._model is None:
            self._init_local()

        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
        sampling_rate = self._model.config.audio_encoder.sampling_rate

        audio_data = audio_values[0, 0].cpu().float().numpy()

        metadata = AudioMetadata(
            sample_rate=sampling_rate,
            channels=1,
            sample_width=2,
            duration_seconds=len(audio_data) / sampling_rate,
            frame_count=len(audio_data),
        )
        return Audio(audio_data, metadata)

generate_audio

generate_audio(
    text: str, max_new_tokens: int = 256
) -> Audio

Generate music audio from text description.

Source code in src/videopython/ai/generation/audio.py
def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
    """Generate music audio from text description."""
    if self._model is None:
        self._init_local()

    inputs = self._processor(text=[text], padding=True, return_tensors="pt")
    inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
    audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
    sampling_rate = self._model.config.audio_encoder.sampling_rate

    audio_data = audio_values[0, 0].cpu().float().numpy()

    metadata = AudioMetadata(
        sample_rate=sampling_rate,
        channels=1,
        sample_width=2,
        duration_seconds=len(audio_data) / sampling_rate,
        frame_count=len(audio_data),
    )
    return Audio(audio_data, metadata)
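
A short usage sketch; `max_new_tokens` controls clip length. MusicGen emits roughly 50 audio tokens per second, so the default of 256 tokens comes out to about five seconds of music (this rate is a property of MusicGen's codec, not a setting in the code above):

from videopython.ai.generation.audio import TextToMusic

composer = TextToMusic()
clip = composer.generate_audio(
    "lo-fi hip hop beat with mellow piano and vinyl crackle",
    max_new_tokens=512,  # ≈ 10 seconds at ~50 tokens/s
)
# `clip` is an Audio wrapping the mono samples plus the AudioMetadata
# (sample rate, duration, frame count) built in the source above.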