AI Generation
Generate videos, images, audio, and music from text prompts.
Local Model Support
| Class | Local Model Family |
| --- | --- |
| TextToVideo | CogVideoX1.5-5B |
| ImageToVideo | CogVideoX1.5-5B-I2V |
| TextToSpeech | Chatterbox Multilingual |
| TextToMusic | MusicGen |
| TextToImage | SDXL |
TextToVideo
TextToVideo
Generates videos from text descriptions using local diffusion models.
Source code in src/videopython/ai/generation/video.py
class TextToVideo:
    """Generates videos from text descriptions using local diffusion models.

    The CogVideoX pipeline is loaded lazily on the first ``generate_video()``
    call and can be dropped again with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        """Remember the requested device; no model is loaded here.

        Args:
            device: Device request handed to ``_get_torch_device_and_dtype``
                when the pipeline is first needed. ``None`` leaves the choice
                to that helper.
        """
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Load ``THUDM/CogVideoX1.5-5B`` and move it to the resolved device."""
        from diffusers import CogVideoXPipeline

        requested_device = self.device
        device, dtype = _get_torch_device_and_dtype(self.device)

        model_name = "THUDM/CogVideoX1.5-5B"
        pipeline = CogVideoXPipeline.from_pretrained(model_name, torch_dtype=dtype)
        pipeline.to(device)

        # Publish the pipeline only after the device move succeeded. If
        # ``.to(device)`` raises, ``self._pipeline`` stays ``None`` and the
        # next generate_video() retries initialization instead of reusing a
        # pipeline that never reached its device.
        self._pipeline = pipeline
        self.device = device

        log_device_initialization(
            "TextToVideo",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_video(
        self,
        prompt: str,
        num_steps: int = 50,
        num_frames: int = 81,
        guidance_scale: float = 6.0,
    ) -> Video:
        """Generate video from text prompt.

        Args:
            prompt: Text description passed to the pipeline.
            num_steps: Forwarded to the pipeline as ``num_inference_steps``.
            num_frames: Forwarded to the pipeline as ``num_frames``.
            guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

        Returns:
            A ``Video`` built from the first entry of the pipeline output's
            ``frames``, converted to ``uint8`` and tagged with 16 fps.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        # The generator is re-seeded with the constant 42 on every call.
        video_frames = self._pipeline(
            prompt=prompt,
            num_inference_steps=num_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device=self.device).manual_seed(42),
        ).frames[0]
        video_frames = np.asarray(video_frames, dtype=np.uint8)
        return Video.from_frames(video_frames, fps=16.0)

    def unload(self) -> None:
        """Release the diffusion pipeline so the next generate_video() re-initializes."""
        self._pipeline = None
        release_device_memory(self.device)
|
generate_video
generate_video(
prompt: str,
num_steps: int = 50,
num_frames: int = 81,
guidance_scale: float = 6.0,
) -> Video
Generate video from text prompt.
Source code in src/videopython/ai/generation/video.py
def generate_video(
    self,
    prompt: str,
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video:
    """Render a video for ``prompt`` with the lazily loaded pipeline.

    Args:
        prompt: Text description passed to the pipeline.
        num_steps: Forwarded to the pipeline as ``num_inference_steps``.
        num_frames: Forwarded to the pipeline as ``num_frames``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

    Returns:
        A ``Video`` built from the first entry of the pipeline output's
        ``frames``, converted to ``uint8`` and tagged with 16 fps.
    """
    import torch

    # First use: load the model before touching the pipeline.
    if self._pipeline is None:
        self._init_local()

    run_pipeline = self._pipeline
    # Seeded with the constant 42 on every call.
    seeded_generator = torch.Generator(device=self.device).manual_seed(42)
    output = run_pipeline(
        prompt=prompt,
        num_inference_steps=num_steps,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        generator=seeded_generator,
    )
    frames = np.asarray(output.frames[0], dtype=np.uint8)
    return Video.from_frames(frames, fps=16.0)
|
unload
Release the diffusion pipeline so the next generate_video() re-initializes.
Source code in src/videopython/ai/generation/video.py
def unload(self) -> None:
    """Drop the cached diffusion pipeline and release device memory.

    Afterwards ``self._pipeline`` is ``None``, which is the condition
    ``generate_video()`` checks before initializing the pipeline again.
    """
    # Clear our reference first, then ask the helper to release memory
    # for the device this instance was using.
    self._pipeline = None
    release_device_memory(self.device)
|
ImageToVideo
ImageToVideo
Generates videos from static images using local video diffusion.
Source code in src/videopython/ai/generation/video.py
class ImageToVideo:
    """Generates videos from static images using local video diffusion.

    The CogVideoX image-to-video pipeline is loaded lazily on the first
    ``generate_video()`` call and can be dropped again with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        """Remember the requested device; no model is loaded here.

        Args:
            device: Device request handed to ``_get_torch_device_and_dtype``
                when the pipeline is first needed. ``None`` leaves the choice
                to that helper.
        """
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Load ``THUDM/CogVideoX1.5-5B-I2V`` and move it to the resolved device."""
        from diffusers import CogVideoXImageToVideoPipeline

        requested_device = self.device
        device, dtype = _get_torch_device_and_dtype(self.device)

        model_name = "THUDM/CogVideoX1.5-5B-I2V"
        pipeline = CogVideoXImageToVideoPipeline.from_pretrained(model_name, torch_dtype=dtype)
        pipeline.to(device)

        # Publish the pipeline only after the device move succeeded. If
        # ``.to(device)`` raises, ``self._pipeline`` stays ``None`` and the
        # next generate_video() retries initialization instead of reusing a
        # pipeline that never reached its device.
        self._pipeline = pipeline
        self.device = device

        log_device_initialization(
            "ImageToVideo",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_video(
        self,
        image: Image,
        prompt: str = "",
        num_steps: int = 50,
        num_frames: int = 81,
        guidance_scale: float = 6.0,
    ) -> Video:
        """Generate video animation from a static image.

        Args:
            image: Source image forwarded to the pipeline as ``image``.
            prompt: Optional text prompt; defaults to the empty string.
            num_steps: Forwarded to the pipeline as ``num_inference_steps``.
            num_frames: Forwarded to the pipeline as ``num_frames``.
            guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

        Returns:
            A ``Video`` built from the first entry of the pipeline output's
            ``frames``, converted to ``uint8`` and tagged with 16 fps.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        # The generator is re-seeded with the constant 42 on every call.
        video_frames = self._pipeline(
            prompt=prompt,
            image=image,
            num_inference_steps=num_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device=self.device).manual_seed(42),
        ).frames[0]
        video_frames = np.asarray(video_frames, dtype=np.uint8)
        return Video.from_frames(video_frames, fps=16.0)

    def unload(self) -> None:
        """Release the diffusion pipeline so the next generate_video() re-initializes."""
        self._pipeline = None
        release_device_memory(self.device)
|
generate_video
generate_video(
image: Image,
prompt: str = "",
num_steps: int = 50,
num_frames: int = 81,
guidance_scale: float = 6.0,
) -> Video
Generate video animation from a static image.
Source code in src/videopython/ai/generation/video.py
def generate_video(
    self,
    image: Image,
    prompt: str = "",
    num_steps: int = 50,
    num_frames: int = 81,
    guidance_scale: float = 6.0,
) -> Video:
    """Animate ``image`` into a video with the lazily loaded pipeline.

    Args:
        image: Source image forwarded to the pipeline as ``image``.
        prompt: Optional text prompt; defaults to the empty string.
        num_steps: Forwarded to the pipeline as ``num_inference_steps``.
        num_frames: Forwarded to the pipeline as ``num_frames``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

    Returns:
        A ``Video`` built from the first entry of the pipeline output's
        ``frames``, converted to ``uint8`` and tagged with 16 fps.
    """
    import torch

    # First use: load the model before touching the pipeline.
    if self._pipeline is None:
        self._init_local()

    run_pipeline = self._pipeline
    # Seeded with the constant 42 on every call.
    seeded_generator = torch.Generator(device=self.device).manual_seed(42)
    output = run_pipeline(
        prompt=prompt,
        image=image,
        num_inference_steps=num_steps,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        generator=seeded_generator,
    )
    frames = np.asarray(output.frames[0], dtype=np.uint8)
    return Video.from_frames(frames, fps=16.0)
|
unload
Release the diffusion pipeline so the next generate_video() re-initializes.
Source code in src/videopython/ai/generation/video.py
def unload(self) -> None:
    """Drop the cached diffusion pipeline and release device memory.

    Afterwards ``self._pipeline`` is ``None``, which is the condition
    ``generate_video()`` checks before initializing the pipeline again.
    """
    # Clear our reference first, then ask the helper to release memory
    # for the device this instance was using.
    self._pipeline = None
    release_device_memory(self.device)
|
TextToImage
TextToImage
Generates images from text descriptions using local models.
Source code in src/videopython/ai/generation/image.py
class TextToImage:
    """Generates images from text descriptions using local models.

    The SDXL pipeline is loaded lazily on the first ``generate_image()``
    call and can be dropped again with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        """Remember the requested device; no model is loaded here.

        Args:
            device: Device request handed to ``select_device`` when the
                pipeline is first needed. ``None`` leaves the choice to that
                helper.
        """
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Initialize local diffusion pipeline."""
        import torch
        from diffusers import DiffusionPipeline

        requested_device = self.device
        device = select_device(self.device, mps_allowed=True)

        # Half precision (and the fp16 weight variant) only when the resolved
        # device is exactly "cuda"; everything else loads float32 weights.
        dtype = torch.float16 if device == "cuda" else torch.float32
        variant = "fp16" if device == "cuda" else None

        model_name = "stabilityai/stable-diffusion-xl-base-1.0"
        pipeline = DiffusionPipeline.from_pretrained(
            model_name,
            torch_dtype=dtype,
            variant=variant,
            use_safetensors=True,
        )
        pipeline.to(device)

        # Publish the pipeline only after the device move succeeded. If
        # ``.to(device)`` raises, ``self._pipeline`` stays ``None`` and the
        # next generate_image() retries initialization instead of reusing a
        # pipeline that never reached its device.
        self._pipeline = pipeline
        self.device = device

        log_device_initialization(
            "TextToImage",
            requested_device=requested_device,
            resolved_device=device,
        )

        if device == "mps":
            self._pipeline.enable_attention_slicing()

    def generate_image(self, prompt: str) -> Image.Image:
        """Generate an image from a text prompt.

        Args:
            prompt: Text description passed to the pipeline.

        Returns:
            The first entry of the pipeline output's ``images``.
        """
        if self._pipeline is None:
            self._init_local()
        return self._pipeline(prompt=prompt).images[0]

    def unload(self) -> None:
        """Release the diffusion pipeline so the next generate_image() re-initializes."""
        self._pipeline = None
        release_device_memory(self.device)
|
generate_image
generate_image(prompt: str) -> Image.Image
Generate an image from a text prompt.
Source code in src/videopython/ai/generation/image.py
def generate_image(self, prompt: str) -> Image.Image:
    """Produce a single image for ``prompt``.

    Loads the pipeline on first use, runs it with ``prompt`` and returns
    the first entry of the output's ``images``.
    """
    # First use: load the model before touching the pipeline.
    if self._pipeline is None:
        self._init_local()

    result = self._pipeline(prompt=prompt)
    first_image = result.images[0]
    return first_image
|
unload
Release the diffusion pipeline so the next generate_image() re-initializes.
Source code in src/videopython/ai/generation/image.py
def unload(self) -> None:
    """Drop the cached diffusion pipeline and release device memory.

    Afterwards ``self._pipeline`` is ``None``, which is the condition
    ``generate_image()`` checks before initializing the pipeline again.
    """
    # Clear our reference first, then ask the helper to release memory
    # for the device this instance was using.
    self._pipeline = None
    release_device_memory(self.device)
|
TextToSpeech
generate_audio accepts three optional Chatterbox generate() knobs —
exaggeration, cfg_weight, and temperature — for callers who want to
shape per-utterance prosody. Each defaults to None, which means "don't pass
the kwarg, let Chatterbox use its default". The dubbing pipeline derives them
per-segment from source vocals RMS via the
Expressiveness dataclass.
# Usage example: synthesize speech with and without the optional knobs.
from videopython.ai import TextToSpeech
# No voice, device or language is given, so the constructor defaults apply.
tts = TextToSpeech()
# Chatterbox defaults: no knob is passed, so none is forwarded to generate().
audio = tts.generate_audio("Welcome to videopython.")
# Dramatic delivery (higher exaggeration, lower cfg_weight slows pacing).
# temperature is left unset here, so Chatterbox keeps its own default for it.
dramatic = tts.generate_audio(
"We made it.",
exaggeration=0.85,
cfg_weight=0.35,
)
TextToSpeech
Generates speech audio from text using Chatterbox Multilingual.
Backed by Chatterbox Multilingual (Resemble AI). When voice_sample is
provided to generate_audio, the model clones that voice; otherwise it
falls back to Chatterbox's built-in default speaker.
Source code in src/videopython/ai/generation/audio.py
class TextToSpeech:
    """Generates speech audio from text using Chatterbox Multilingual.

    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
    provided to ``generate_audio``, the model clones that voice; otherwise it
    falls back to Chatterbox's built-in default speaker.
    """

    # Sample rate written into the metadata of every generated clip.
    SAMPLE_RATE: int = 24000

    def __init__(
        self,
        voice: Audio | None = None,
        device: str | None = None,
        language: str = "en",
    ):
        """Store configuration; the model is loaded on first use.

        Args:
            voice: Default voice sample used when ``generate_audio`` is called
                without ``voice_sample`` / ``voice_sample_path``.
            device: Device request handed to ``select_device`` when the model
                is first needed.
            language: Passed to Chatterbox as ``language_id``.
        """
        self.voice = voice
        self.device = device
        self.language = language
        self._model: Any = None

    def _init_local(self) -> None:
        """Load the Chatterbox model on the resolved device (MPS not allowed)."""
        from chatterbox.mtl_tts import ChatterboxMultilingualTTS

        requested_device = self.device
        device = select_device(self.device, mps_allowed=False)

        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
        self.device = device

        log_device_initialization(
            "TextToSpeech",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_audio(
        self,
        text: str,
        voice_sample: Audio | None = None,
        voice_sample_path: str | Path | None = None,
        exaggeration: float | None = None,
        cfg_weight: float | None = None,
        temperature: float | None = None,
    ) -> Audio:
        """Generate speech audio from text.

        Args:
            text: Text to synthesize.
            voice_sample: Optional voice sample to clone. Falls back to the
                instance's ``voice`` and then to Chatterbox's default speaker.
            voice_sample_path: Optional pre-encoded WAV path to use directly as
                the speaker prompt. Skips the per-call temp-WAV encode that
                ``voice_sample`` would otherwise trigger. When set, takes
                precedence over ``voice_sample`` and ``self.voice``. Used by
                the dubbing pipeline to encode each speaker's sample once and
                reuse it across all of that speaker's segments.
            exaggeration: Chatterbox emotional-intensity knob (default
                ``0.5``). ``None`` (default) means do not pass the kwarg —
                Chatterbox uses its own default and we stay forward-compatible
                with changes to it. ``0.7+`` produces dramatic output.
            cfg_weight: Chatterbox classifier-free-guidance weight (default
                ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
                slow pacing.
            temperature: Chatterbox sampling temperature (default ``0.8``).
                ``None`` means do not pass.

        Returns:
            Mono ``Audio`` whose metadata uses ``SAMPLE_RATE``.
        """
        import tempfile
        from pathlib import Path

        import numpy as np

        if self._model is None:
            self._init_local()

        # Only forward knobs the caller explicitly set. Passing nothing
        # for a knob lets Chatterbox use its own default — important so a
        # future Chatterbox default change doesn't get pinned by us.
        knobs: dict[str, float] = {}
        if exaggeration is not None:
            knobs["exaggeration"] = exaggeration
        if cfg_weight is not None:
            knobs["cfg_weight"] = cfg_weight
        if temperature is not None:
            knobs["temperature"] = temperature

        speaker_wav_path: Path | None = None
        cleanup_path = False

        try:
            if voice_sample_path is not None:
                # Caller-owned file: use it as-is and never delete it.
                speaker_wav_path = Path(voice_sample_path)
            else:
                effective_sample = voice_sample or self.voice
                if effective_sample is not None:
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                        # Record the path and cleanup flag *before* saving so
                        # the ``finally`` below removes the temp file even
                        # when ``save()`` raises.
                        speaker_wav_path = Path(f.name)
                        cleanup_path = True
                        effective_sample.save(f.name)

            wav = self._model.generate(
                text=text,
                language_id=self.language,
                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
                **knobs,
            )

            audio_data = wav.cpu().float().numpy().squeeze()
            # squeeze() turns a one-sample result into a 0-d array; restore
            # one dimension so ``len()`` below works.
            if audio_data.ndim == 0:
                audio_data = np.array([audio_data], dtype=np.float32)

            metadata = AudioMetadata(
                sample_rate=self.SAMPLE_RATE,
                channels=1,
                sample_width=2,
                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
                frame_count=len(audio_data),
            )
            return Audio(audio_data, metadata)
        finally:
            # Only temp files created by this call are removed.
            if cleanup_path and speaker_wav_path is not None:
                speaker_wav_path.unlink(missing_ok=True)

    def unload(self) -> None:
        """Release the TTS model so the next generate_audio() re-initializes.

        Used by low-memory dubbing to free VRAM between pipeline stages.
        """
        self._model = None
        release_device_memory(self.device)
|
generate_audio
generate_audio(
text: str,
voice_sample: Audio | None = None,
voice_sample_path: str | Path | None = None,
exaggeration: float | None = None,
cfg_weight: float | None = None,
temperature: float | None = None,
) -> Audio
Generate speech audio from text.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | Text to synthesize. | required |
| `voice_sample` | `Audio \| None` | Optional voice sample to clone. Falls back to the instance's `voice` and then to Chatterbox's default speaker. | `None` |
| `voice_sample_path` | `str \| Path \| None` | Optional pre-encoded WAV path to use directly as the speaker prompt. Skips the per-call temp-WAV encode that `voice_sample` would otherwise trigger. When set, takes precedence over `voice_sample` and `self.voice`. Used by the dubbing pipeline to encode each speaker's sample once and reuse it across all of that speaker's segments. | `None` |
| `exaggeration` | `float \| None` | Chatterbox emotional-intensity knob (default `0.5`). `None` (default) means do not pass the kwarg — Chatterbox uses its own default and we stay forward-compatible with changes to it. `0.7+` produces dramatic output. | `None` |
| `cfg_weight` | `float \| None` | Chatterbox classifier-free-guidance weight (default `0.5`). `None` means do not pass. Lower values (~`0.3`) slow pacing. | `None` |
| `temperature` | `float \| None` | Chatterbox sampling temperature (default `0.8`). `None` means do not pass. | `None` |
Source code in src/videopython/ai/generation/audio.py
def generate_audio(
    self,
    text: str,
    voice_sample: Audio | None = None,
    voice_sample_path: str | Path | None = None,
    exaggeration: float | None = None,
    cfg_weight: float | None = None,
    temperature: float | None = None,
) -> Audio:
    """Generate speech audio from text.

    Args:
        text: Text to synthesize.
        voice_sample: Optional voice sample to clone. Falls back to the
            instance's ``voice`` and then to Chatterbox's default speaker.
        voice_sample_path: Optional pre-encoded WAV path to use directly as
            the speaker prompt. Skips the per-call temp-WAV encode that
            ``voice_sample`` would otherwise trigger. When set, takes
            precedence over ``voice_sample`` and ``self.voice``. Used by
            the dubbing pipeline to encode each speaker's sample once and
            reuse it across all of that speaker's segments.
        exaggeration: Chatterbox emotional-intensity knob (default
            ``0.5``). ``None`` (default) means do not pass the kwarg —
            Chatterbox uses its own default and we stay forward-compatible
            with changes to it. ``0.7+`` produces dramatic output.
        cfg_weight: Chatterbox classifier-free-guidance weight (default
            ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
            slow pacing.
        temperature: Chatterbox sampling temperature (default ``0.8``).
            ``None`` means do not pass.

    Returns:
        Mono ``Audio`` whose metadata uses ``SAMPLE_RATE``.
    """
    import tempfile
    from pathlib import Path

    import numpy as np

    if self._model is None:
        self._init_local()

    # Only forward knobs the caller explicitly set. Passing nothing
    # for a knob lets Chatterbox use its own default — important so a
    # future Chatterbox default change doesn't get pinned by us.
    knobs: dict[str, float] = {}
    if exaggeration is not None:
        knobs["exaggeration"] = exaggeration
    if cfg_weight is not None:
        knobs["cfg_weight"] = cfg_weight
    if temperature is not None:
        knobs["temperature"] = temperature

    speaker_wav_path: Path | None = None
    cleanup_path = False

    try:
        if voice_sample_path is not None:
            # Caller-owned file: use it as-is and never delete it.
            speaker_wav_path = Path(voice_sample_path)
        else:
            effective_sample = voice_sample or self.voice
            if effective_sample is not None:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    # Record the path and cleanup flag *before* saving so
                    # the ``finally`` below removes the temp file even
                    # when ``save()`` raises.
                    speaker_wav_path = Path(f.name)
                    cleanup_path = True
                    effective_sample.save(f.name)

        wav = self._model.generate(
            text=text,
            language_id=self.language,
            audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
            **knobs,
        )

        audio_data = wav.cpu().float().numpy().squeeze()
        # squeeze() turns a one-sample result into a 0-d array; restore
        # one dimension so ``len()`` below works.
        if audio_data.ndim == 0:
            audio_data = np.array([audio_data], dtype=np.float32)

        metadata = AudioMetadata(
            sample_rate=self.SAMPLE_RATE,
            channels=1,
            sample_width=2,
            duration_seconds=len(audio_data) / self.SAMPLE_RATE,
            frame_count=len(audio_data),
        )
        return Audio(audio_data, metadata)
    finally:
        # Only temp files created by this call are removed.
        if cleanup_path and speaker_wav_path is not None:
            speaker_wav_path.unlink(missing_ok=True)
|
unload
Release the TTS model so the next generate_audio() re-initializes.
Used by low-memory dubbing to free VRAM between pipeline stages.
Source code in src/videopython/ai/generation/audio.py
def unload(self) -> None:
    """Drop the cached TTS model and release device memory.

    Afterwards ``self._model`` is ``None``, which is the condition
    ``generate_audio()`` checks before initializing the model again.
    Used by low-memory dubbing to free VRAM between pipeline stages.
    """
    # Clear our reference first, then ask the helper to release memory
    # for the device this instance was using.
    self._model = None
    release_device_memory(self.device)
|
TextToMusic
TextToMusic
Generates music from text descriptions using MusicGen.
Source code in src/videopython/ai/generation/audio.py
class TextToMusic:
    """Generates music from text descriptions using MusicGen.

    Processor and model are loaded lazily on the first ``generate_audio()``
    call and can be dropped again with ``unload()``.
    """

    def __init__(self, device: str | None = None):
        """Remember the requested device; no model is loaded here.

        Args:
            device: Device request handed to ``select_device`` when the model
                is first needed. ``None`` leaves the choice to that helper.
        """
        self.device = device
        self._processor: Any = None
        self._model: Any = None

    def _init_local(self) -> None:
        """Initialize local MusicGen model."""
        import os

        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        # Set unconditionally before loading, overwriting any existing value.
        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

        requested_device = self.device
        device = select_device(self.device, mps_allowed=True)

        model_name = "facebook/musicgen-small"
        processor = AutoProcessor.from_pretrained(model_name)
        model = MusicgenForConditionalGeneration.from_pretrained(model_name)
        model.to(device)

        # Publish processor and model together, and only after the device
        # move succeeded. If loading or ``.to(device)`` raises,
        # ``self._model`` stays ``None`` and the next generate_audio()
        # retries initialization instead of reusing a model that never
        # reached its device.
        self._processor = processor
        self._model = model
        self.device = device

        log_device_initialization(
            "TextToMusic",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
        """Generate music audio from text description.

        Args:
            text: Description of the music, passed to the processor.
            max_new_tokens: Forwarded to the model's ``generate()``.

        Returns:
            Mono ``Audio`` taken from ``audio_values[0, 0]`` of the model
            output, with the sample rate read from the model's audio-encoder
            config.
        """
        if self._model is None:
            self._init_local()

        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
        # Move every entry that supports ``.to`` onto the model's device;
        # anything else is passed through unchanged.
        inputs = {k: v.to(self.device) if hasattr(v, "to") else v for k, v in inputs.items()}
        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)

        sampling_rate = self._model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().float().numpy()

        metadata = AudioMetadata(
            sample_rate=sampling_rate,
            channels=1,
            sample_width=2,
            duration_seconds=len(audio_data) / sampling_rate,
            frame_count=len(audio_data),
        )
        return Audio(audio_data, metadata)

    def unload(self) -> None:
        """Release the MusicGen model so the next generate_audio() re-initializes."""
        self._model = None
        self._processor = None
        release_device_memory(self.device)
|
generate_audio
generate_audio(
text: str, max_new_tokens: int = 256
) -> Audio
Generate music audio from text description.
Source code in src/videopython/ai/generation/audio.py
def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
    """Synthesize a music clip described by ``text``.

    Args:
        text: Description of the music, passed to the processor.
        max_new_tokens: Forwarded to the model's ``generate()``.

    Returns:
        Mono ``Audio`` taken from ``audio_values[0, 0]`` of the model
        output, with the sample rate read from the model's audio-encoder
        config.
    """
    # First use: load processor and model.
    if self._model is None:
        self._init_local()

    encoded = self._processor(text=[text], padding=True, return_tensors="pt")

    # Move every entry that supports ``.to`` onto the model's device;
    # anything else is passed through unchanged.
    model_inputs = {}
    for key, value in encoded.items():
        model_inputs[key] = value.to(self.device) if hasattr(value, "to") else value

    generated = self._model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    rate = self._model.config.audio_encoder.sampling_rate
    samples = generated[0, 0].cpu().float().numpy()

    return Audio(
        samples,
        AudioMetadata(
            sample_rate=rate,
            channels=1,
            sample_width=2,
            duration_seconds=len(samples) / rate,
            frame_count=len(samples),
        ),
    )
|
unload
Release the MusicGen model so the next generate_audio() re-initializes.
Source code in src/videopython/ai/generation/audio.py
def unload(self) -> None:
    """Drop the cached MusicGen model and processor and release device memory.

    Afterwards ``self._model`` is ``None``, which is the condition
    ``generate_audio()`` checks before initializing again.
    """
    # Clear both references first, then ask the helper to release memory
    # for the device this instance was using.
    self._model = None
    self._processor = None
    release_device_memory(self.device)
|