AI Generation

Generate videos, images, speech, and music from text prompts, and animate still images into video.

Local Model Support

Class	Local Model Family
TextToVideo	Wan2.2-T2V-A14B
ImageToVideo	Wan2.2-I2V-A14B
TextToSpeech	Chatterbox Multilingual
TextToMusic	MusicGen
TextToImage	Qwen-Image

TextToVideo

Bases: ManagedPredictor

Generates videos from text descriptions using Wan2.2-T2V (Apache-2.0).

Source code in src/videopython/ai/generation/video.py

class TextToVideo(ManagedPredictor):
    """Generates videos from text descriptions using Wan2.2-T2V (Apache-2.0).

    The diffusers pipeline is created on the first ``generate_video`` call
    (see ``_init_local``) and stored on ``_pipeline``.
    """

    _model_attrs = ("_pipeline",)

    def __init__(self, device: str | None = None):
        # Nothing is loaded here; the pipeline stays ``None`` until first use.
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Load the Wan2.2-T2V-A14B pipeline and enable model CPU offload."""
        import torch

        from videopython.ai._optional import require

        diffusers_mod = require("diffusers", feature="TextToVideo")
        pipeline_cls = diffusers_mod.WanPipeline
        vae_cls = diffusers_mod.AutoencoderKLWan

        asked_for = self.device
        resolved = _require_cuda_device(asked_for, "TextToVideo")

        repo_id = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
        shared_kwargs = {"revision": pinned(repo_id)}
        # The VAE is loaded on its own in float32 for decode quality, while the
        # transformer/pipeline weights are loaded in bf16.
        vae = vae_cls.from_pretrained(repo_id, subfolder="vae", torch_dtype=torch.float32, **shared_kwargs)
        pipe = pipeline_cls.from_pretrained(repo_id, vae=vae, torch_dtype=torch.bfloat16, **shared_kwargs)
        self._pipeline = pipe
        _retie_umt5_embeddings(pipe)
        # A14B is a MoE (high+low-noise experts) and too large for a single-GPU
        # .to("cuda"). Offload moves submodules on demand and owns device
        # placement, so .to must NOT be called as well.
        pipe.enable_model_cpu_offload()

        self.device = resolved
        log_device_initialization(
            "TextToVideo",
            requested_device=asked_for,
            resolved_device=resolved,
        )

    def generate_video(
        self,
        prompt: str,
        num_steps: int = 40,
        num_frames: int = 81,
        guidance_scale: float = 4.0,
    ) -> Video:
        """Generate video from text prompt.

        Args:
            prompt: Text description of the video.
            num_steps: Forwarded to the pipeline as ``num_inference_steps``.
            num_frames: Number of frames requested from the pipeline.
            guidance_scale: Guidance scale forwarded to the pipeline; the
                second scale (``guidance_scale_2``) is fixed at ``3.0``.

        Returns:
            A ``Video`` built from the generated frames at 16 fps. Frames are
            requested at 1280x720 and the generator seed is fixed at 42.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        seeded_generator = torch.Generator(device=self.device).manual_seed(42)
        # Request PIL frames explicitly: the "np" default yields float32 values
        # in [0, 1], and the uint8 conversion below would floor those to an
        # all-black video.
        output = self._pipeline(
            prompt=prompt,
            negative_prompt=_WAN_NEGATIVE_PROMPT,
            height=720,
            width=1280,
            num_frames=num_frames,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            guidance_scale_2=3.0,  # low-noise MoE expert; None would reuse guidance_scale
            output_type="pil",
            generator=seeded_generator,
        )
        frame_array = np.asarray(output.frames[0], dtype=np.uint8)
        return Video.from_frames(frame_array, fps=16.0)

generate_video

generate_video(
    prompt: str,
    num_steps: int = 40,
    num_frames: int = 81,
    guidance_scale: float = 4.0,
) -> Video

Generate video from text prompt.

Source code in src/videopython/ai/generation/video.py

def generate_video(
    self,
    prompt: str,
    num_steps: int = 40,
    num_frames: int = 81,
    guidance_scale: float = 4.0,
) -> Video:
    """Generate video from text prompt.

    Args:
        prompt: Text description of the video.
        num_steps: Forwarded to the pipeline as ``num_inference_steps``.
        num_frames: Number of frames requested from the pipeline.
        guidance_scale: Guidance scale forwarded to the pipeline; the
            second scale (``guidance_scale_2``) is fixed at ``3.0``.

    Returns:
        A ``Video`` built from the generated frames at 16 fps. Frames are
        requested at 1280x720 and the generator seed is fixed at 42.
    """
    import torch

    if self._pipeline is None:
        self._init_local()

    seeded_generator = torch.Generator(device=self.device).manual_seed(42)
    # Request PIL frames explicitly: the "np" default yields float32 values
    # in [0, 1], and the uint8 conversion below would floor those to an
    # all-black video.
    output = self._pipeline(
        prompt=prompt,
        negative_prompt=_WAN_NEGATIVE_PROMPT,
        height=720,
        width=1280,
        num_frames=num_frames,
        num_inference_steps=num_steps,
        guidance_scale=guidance_scale,
        guidance_scale_2=3.0,  # low-noise MoE expert; None would reuse guidance_scale
        output_type="pil",
        generator=seeded_generator,
    )
    frame_array = np.asarray(output.frames[0], dtype=np.uint8)
    return Video.from_frames(frame_array, fps=16.0)

ImageToVideo

Bases: ManagedPredictor

Generates videos from static images using Wan2.2-I2V (Apache-2.0).

Source code in src/videopython/ai/generation/video.py

class ImageToVideo(ManagedPredictor):
    """Generates videos from static images using Wan2.2-I2V (Apache-2.0).

    The diffusers pipeline is created on the first ``generate_video`` call
    (see ``_init_local``) and stored on ``_pipeline``.
    """

    _model_attrs = ("_pipeline",)

    def __init__(self, device: str | None = None):
        # Nothing is loaded here; the pipeline stays ``None`` until first use.
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Load the Wan2.2-I2V-A14B pipeline and enable model CPU offload."""
        import torch

        from videopython.ai._optional import require

        diffusers_mod = require("diffusers", feature="ImageToVideo")
        pipeline_cls = diffusers_mod.WanImageToVideoPipeline
        vae_cls = diffusers_mod.AutoencoderKLWan

        asked_for = self.device
        resolved = _require_cuda_device(asked_for, "ImageToVideo")

        repo_id = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
        shared_kwargs = {"revision": pinned(repo_id)}
        # VAE in float32, remaining pipeline weights in bf16.
        vae = vae_cls.from_pretrained(repo_id, subfolder="vae", torch_dtype=torch.float32, **shared_kwargs)
        pipe = pipeline_cls.from_pretrained(repo_id, vae=vae, torch_dtype=torch.bfloat16, **shared_kwargs)
        self._pipeline = pipe
        _retie_umt5_embeddings(pipe)
        # The A14B MoE is too large for a single-GPU .to("cuda"), so submodules
        # are offloaded and brought in on demand instead.
        pipe.enable_model_cpu_offload()

        self.device = resolved
        log_device_initialization(
            "ImageToVideo",
            requested_device=asked_for,
            resolved_device=resolved,
        )

    def _resize_to_model_grid(self, image: Image) -> tuple[Image, int, int]:
        """Resize ``image`` to Wan's area budget, snapped to the model's spatial grid.

        The target size keeps the input aspect ratio while covering roughly
        Wan's 480x832 pixel area (per the model card); each side is then
        floored to a multiple of the VAE spatial scale factor times the
        transformer patch size.

        Returns:
            The resized image followed by the ``height`` and ``width`` to
            request from the pipeline.
        """
        pipe = self._pipeline
        area_budget = 480 * 832
        ratio = image.height / image.width
        grid = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]

        unsnapped_height = round(np.sqrt(area_budget * ratio))
        unsnapped_width = round(np.sqrt(area_budget / ratio))
        # Floor each side down to the nearest multiple of ``grid``.
        height = unsnapped_height // grid * grid
        width = unsnapped_width // grid * grid

        resized = image.resize((width, height))
        return resized, height, width

    def generate_video(
        self,
        image: Image,
        prompt: str = "",
        num_steps: int = 40,
        num_frames: int = 81,
        guidance_scale: float = 3.5,
    ) -> Video:
        """Generate video animation from a static image.

        Args:
            image: Source image; it is resized to the model grid before use.
            prompt: Optional text prompt forwarded to the pipeline.
            num_steps: Forwarded to the pipeline as ``num_inference_steps``.
            num_frames: Number of frames requested from the pipeline.
            guidance_scale: Guidance scale forwarded to the pipeline.

        Returns:
            A ``Video`` built from the generated frames at 16 fps. The
            generator seed is fixed at 42.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        resized, height, width = self._resize_to_model_grid(image)
        seeded_generator = torch.Generator(device=self.device).manual_seed(42)
        output = self._pipeline(
            image=resized,
            prompt=prompt,
            negative_prompt=_WAN_NEGATIVE_PROMPT,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            # guidance_scale_2 is deliberately omitted: with its None default the
            # low-noise MoE expert reuses guidance_scale.
            output_type="pil",
            generator=seeded_generator,
        )
        frame_array = np.asarray(output.frames[0], dtype=np.uint8)
        return Video.from_frames(frame_array, fps=16.0)

generate_video

generate_video(
    image: Image,
    prompt: str = "",
    num_steps: int = 40,
    num_frames: int = 81,
    guidance_scale: float = 3.5,
) -> Video

Generate video animation from a static image.

Source code in src/videopython/ai/generation/video.py

def generate_video(
    self,
    image: Image,
    prompt: str = "",
    num_steps: int = 40,
    num_frames: int = 81,
    guidance_scale: float = 3.5,
) -> Video:
    """Generate video animation from a static image.

    Args:
        image: Source image; it is resized to the model grid before use.
        prompt: Optional text prompt forwarded to the pipeline.
        num_steps: Forwarded to the pipeline as ``num_inference_steps``.
        num_frames: Number of frames requested from the pipeline.
        guidance_scale: Guidance scale forwarded to the pipeline.

    Returns:
        A ``Video`` built from the generated frames at 16 fps. The
        generator seed is fixed at 42.
    """
    import torch

    if self._pipeline is None:
        self._init_local()

    resized, height, width = self._resize_to_model_grid(image)
    seeded_generator = torch.Generator(device=self.device).manual_seed(42)
    output = self._pipeline(
        image=resized,
        prompt=prompt,
        negative_prompt=_WAN_NEGATIVE_PROMPT,
        height=height,
        width=width,
        num_frames=num_frames,
        num_inference_steps=num_steps,
        guidance_scale=guidance_scale,
        # guidance_scale_2 is deliberately omitted: with its None default the
        # low-noise MoE expert reuses guidance_scale.
        output_type="pil",
        generator=seeded_generator,
    )
    frame_array = np.asarray(output.frames[0], dtype=np.uint8)
    return Video.from_frames(frame_array, fps=16.0)

TextToImage

Bases: ManagedPredictor

Generates images from text descriptions using local models (Qwen-Image, Apache-2.0).

Source code in src/videopython/ai/generation/image.py

class TextToImage(ManagedPredictor):
    """Generates images from text descriptions using local models (Qwen-Image, Apache-2.0).

    The diffusers pipeline is created on the first ``generate_image`` call
    (see ``_init_local``) and stored on ``_pipeline``.
    """

    _model_attrs = ("_pipeline",)

    def __init__(self, device: str | None = None):
        # Nothing is loaded here; the pipeline stays ``None`` until first use.
        self.device = device
        self._pipeline: Any = None

    def _init_local(self) -> None:
        """Initialize the local Qwen-Image diffusion pipeline (CUDA-only).

        Raises:
            RuntimeError: If the selected device is anything other than
                ``"cuda"``.
        """
        import torch

        from videopython.ai._optional import require

        asked_for = self.device
        resolved = select_device(asked_for, mps_allowed=False)
        if resolved != "cuda":
            raise RuntimeError("TextToImage requires a CUDA GPU; Qwen-Image (~20B) is impractical on CPU/MPS.")

        diffusers_mod = require("diffusers", feature="TextToImage")
        pipeline_cls = diffusers_mod.QwenImagePipeline
        pipe = pipeline_cls.from_pretrained(
            _MODEL_NAME,
            revision=pinned(_MODEL_NAME),
            torch_dtype=torch.bfloat16,
            use_safetensors=True,
        )
        self._pipeline = pipe
        # The model is ~20B params (Qwen2.5-VL text encoder + transformer + VAE).
        # CPU offload moves submodules to the GPU on demand so it fits a single
        # GPU. Offload owns device placement, so .to("cuda") must NOT be called.
        pipe.enable_model_cpu_offload()
        pipe.enable_vae_tiling()

        self.device = resolved
        log_device_initialization(
            "TextToImage",
            requested_device=asked_for,
            resolved_device=resolved,
        )

    def generate_image(
        self,
        prompt: str,
        *,
        negative_prompt: str = " ",
        true_cfg_scale: float = 4.0,
        num_inference_steps: int = 50,
        width: int = 1328,
        height: int = 1328,
        add_magic: bool = True,
        seed: int = 42,
    ) -> Image.Image:
        """Generate an image from a text prompt.

        Classifier-free guidance in Qwen-Image is driven by ``true_cfg_scale``
        (not ``guidance_scale``) and needs a non-empty ``negative_prompt`` to
        be enabled, which is why the default is a single space. When
        ``add_magic`` is true, the model's recommended quality suffix is
        appended to ``prompt``. ``seed`` seeds the generator passed to the
        pipeline.

        Returns:
            The first image produced by the pipeline.
        """
        import torch

        if self._pipeline is None:
            self._init_local()

        if add_magic:
            final_prompt = prompt + _POSITIVE_MAGIC
        else:
            final_prompt = prompt

        seeded_generator = torch.Generator(device=self.device).manual_seed(seed)
        output = self._pipeline(
            prompt=final_prompt,
            negative_prompt=negative_prompt,
            true_cfg_scale=true_cfg_scale,
            num_inference_steps=num_inference_steps,
            width=width,
            height=height,
            generator=seeded_generator,
        )
        return output.images[0]

generate_image

generate_image(
    prompt: str,
    *,
    negative_prompt: str = " ",
    true_cfg_scale: float = 4.0,
    num_inference_steps: int = 50,
    width: int = 1328,
    height: int = 1328,
    add_magic: bool = True,
    seed: int = 42,
) -> Image.Image

Generate an image from a text prompt.

Qwen-Image uses true_cfg_scale (not guidance_scale) for classifier-free guidance; a non-empty negative_prompt (default a single space) is required to enable it. add_magic appends the model's recommended quality suffix to prompt.

Source code in src/videopython/ai/generation/image.py

def generate_image(
    self,
    prompt: str,
    *,
    negative_prompt: str = " ",
    true_cfg_scale: float = 4.0,
    num_inference_steps: int = 50,
    width: int = 1328,
    height: int = 1328,
    add_magic: bool = True,
    seed: int = 42,
) -> Image.Image:
    """Generate an image from a text prompt.

    Classifier-free guidance in Qwen-Image is driven by ``true_cfg_scale``
    (not ``guidance_scale``) and needs a non-empty ``negative_prompt`` to
    be enabled, which is why the default is a single space. When
    ``add_magic`` is true, the model's recommended quality suffix is
    appended to ``prompt``. ``seed`` seeds the generator passed to the
    pipeline.

    Returns:
        The first image produced by the pipeline.
    """
    import torch

    if self._pipeline is None:
        self._init_local()

    if add_magic:
        final_prompt = prompt + _POSITIVE_MAGIC
    else:
        final_prompt = prompt

    seeded_generator = torch.Generator(device=self.device).manual_seed(seed)
    output = self._pipeline(
        prompt=final_prompt,
        negative_prompt=negative_prompt,
        true_cfg_scale=true_cfg_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        generator=seeded_generator,
    )
    return output.images[0]

TextToSpeech

generate_audio accepts three optional Chatterbox generate() knobs — exaggeration, cfg_weight, and temperature — for callers who want to shape per-utterance prosody. Each defaults to None, which means "don't pass the kwarg, let Chatterbox use its default". The dubbing pipeline derives them per-segment from source vocals RMS via the Expressiveness dataclass.

from videopython.ai import TextToSpeech

tts = TextToSpeech()

# No knobs passed: exaggeration, cfg_weight and temperature stay None, so
# Chatterbox uses its own defaults.
audio = tts.generate_audio("Welcome to videopython.")

# Dramatic delivery: higher exaggeration raises emotional intensity, and a
# lower cfg_weight slows pacing.
dramatic = tts.generate_audio(
    "We made it.",
    exaggeration=0.85,
    cfg_weight=0.35,
)

TextToSpeech

Bases: ManagedPredictor

Generates speech audio from text using Chatterbox Multilingual.

Backed by Chatterbox Multilingual (Resemble AI). When voice_sample is provided to generate_audio, the model clones that voice; otherwise it falls back to Chatterbox's built-in default speaker.

Source code in src/videopython/ai/generation/audio.py

class TextToSpeech(ManagedPredictor):
    """Generates speech audio from text using Chatterbox Multilingual.

    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
    provided to ``generate_audio``, the model clones that voice; otherwise it
    falls back to Chatterbox's built-in default speaker.
    """

    # Sample rate written into the metadata of every generated ``Audio``.
    SAMPLE_RATE: int = 24000

    def __init__(
        self,
        voice: Audio | None = None,
        device: str | None = None,
        language: str = "en",
    ):
        # ``voice`` is the default speaker sample used when ``generate_audio``
        # receives neither ``voice_sample_path`` nor ``voice_sample``.
        self.voice = voice
        self.device = device
        # Forwarded to Chatterbox as ``language_id`` on every generate call.
        self.language = language
        # Loaded lazily by ``_init_local`` on the first ``generate_audio`` call.
        self._model: Any = None

    def _init_local(self) -> None:
        """Load the Chatterbox Multilingual model onto the selected device."""
        from videopython.ai._optional import require

        ChatterboxMultilingualTTS = require("chatterbox.mtl_tts", feature="TextToSpeech").ChatterboxMultilingualTTS

        requested_device = self.device
        device = select_device(self.device, mps_allowed=False)

        # No repo id to key a revision on: Chatterbox resolves its own repo +
        # revision internally, so there is nothing to pass revision= to.
        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
        self.device = device
        log_device_initialization(
            "TextToSpeech",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_audio(
        self,
        text: str,
        voice_sample: Audio | None = None,
        voice_sample_path: str | Path | None = None,
        exaggeration: float | None = None,
        cfg_weight: float | None = None,
        temperature: float | None = None,
    ) -> Audio:
        """Generate speech audio from text.

        Args:
            text: Text to synthesize.
            voice_sample: Optional voice sample to clone. Falls back to the
                instance's ``voice`` and then to Chatterbox's default speaker.
            voice_sample_path: Optional pre-encoded WAV path to use directly as
                the speaker prompt. Skips the per-call temp-WAV encode that
                ``voice_sample`` would otherwise trigger. When set, takes
                precedence over ``voice_sample`` and ``self.voice``. Used by
                the dubbing pipeline to encode each speaker's sample once and
                reuse it across all of that speaker's segments.
            exaggeration: Chatterbox emotional-intensity knob (default
                ``0.5``). ``None`` (default) means do not pass the kwarg —
                Chatterbox uses its own default and we stay forward-compatible
                with changes to it. ``0.7+`` produces dramatic output.
            cfg_weight: Chatterbox classifier-free-guidance weight (default
                ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
                slow pacing.
            temperature: Chatterbox sampling temperature (default ``0.8``).
                ``None`` means do not pass.

        Returns:
            Mono ``Audio`` at ``SAMPLE_RATE`` holding the synthesized speech.
        """
        import tempfile
        from pathlib import Path

        import numpy as np

        if self._model is None:
            self._init_local()

        # Only forward knobs the caller explicitly set. Passing nothing
        # for a knob lets Chatterbox use its own default — important so a
        # future Chatterbox default change doesn't get pinned by us.
        knobs: dict[str, float] = {}
        if exaggeration is not None:
            knobs["exaggeration"] = exaggeration
        if cfg_weight is not None:
            knobs["cfg_weight"] = cfg_weight
        if temperature is not None:
            knobs["temperature"] = temperature

        speaker_wav_path: Path | None = None
        cleanup_path = False

        # The temp WAV is created inside the ``try`` so the ``finally`` below
        # removes it even when encoding the sample fails.
        try:
            if voice_sample_path is not None:
                speaker_wav_path = Path(voice_sample_path)
            else:
                # Explicit ``None`` checks: the fallback must depend on whether
                # a sample was supplied, not on the sample's truthiness.
                effective_sample = voice_sample if voice_sample is not None else self.voice
                if effective_sample is not None:
                    # Reserve a unique name, then close the handle before
                    # writing: a file that is still open cannot be reopened by
                    # name on every platform.
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                        temp_name = f.name
                    speaker_wav_path = Path(temp_name)
                    cleanup_path = True
                    effective_sample.save(temp_name)

            wav = self._model.generate(
                text=text,
                language_id=self.language,
                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
                **knobs,
            )

            audio_data = wav.cpu().float().numpy().squeeze()
            if audio_data.ndim == 0:
                # ``squeeze`` collapses a single-sample result to a 0-d array;
                # restore a 1-d array so ``len`` works below.
                audio_data = np.array([audio_data], dtype=np.float32)

            metadata = AudioMetadata(
                sample_rate=self.SAMPLE_RATE,
                channels=1,
                sample_width=2,
                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
                frame_count=len(audio_data),
            )
            return Audio(audio_data, metadata)
        finally:
            # Only delete files created here; a caller-supplied path is kept.
            if cleanup_path and speaker_wav_path is not None:
                speaker_wav_path.unlink(missing_ok=True)

generate_audio

generate_audio(
    text: str,
    voice_sample: Audio | None = None,
    voice_sample_path: str | Path | None = None,
    exaggeration: float | None = None,
    cfg_weight: float | None = None,
    temperature: float | None = None,
) -> Audio

Generate speech audio from text.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to synthesize.	required
`voice_sample`	`Audio \| None`	Optional voice sample to clone. Falls back to the instance's `voice` and then to Chatterbox's default speaker.	`None`
`voice_sample_path`	`str \| Path \| None`	Optional pre-encoded WAV path to use directly as the speaker prompt. Skips the per-call temp-WAV encode that `voice_sample` would otherwise trigger. When set, takes precedence over `voice_sample` and `self.voice`. Used by the dubbing pipeline to encode each speaker's sample once and reuse it across all of that speaker's segments.	`None`
`exaggeration`	`float \| None`	Chatterbox emotional-intensity knob (default `0.5`). `None` (default) means do not pass the kwarg — Chatterbox uses its own default and we stay forward-compatible with changes to it. `0.7+` produces dramatic output.	`None`
`cfg_weight`	`float \| None`	Chatterbox classifier-free-guidance weight (default `0.5`). `None` means do not pass. Lower values (~`0.3`) slow pacing.	`None`
`temperature`	`float \| None`	Chatterbox sampling temperature (default `0.8`). `None` means do not pass.	`None`

Source code in src/videopython/ai/generation/audio.py

def generate_audio(
    self,
    text: str,
    voice_sample: Audio | None = None,
    voice_sample_path: str | Path | None = None,
    exaggeration: float | None = None,
    cfg_weight: float | None = None,
    temperature: float | None = None,
) -> Audio:
    """Generate speech audio from text.

    Args:
        text: Text to synthesize.
        voice_sample: Optional voice sample to clone. Falls back to the
            instance's ``voice`` and then to Chatterbox's default speaker.
        voice_sample_path: Optional pre-encoded WAV path to use directly as
            the speaker prompt. Skips the per-call temp-WAV encode that
            ``voice_sample`` would otherwise trigger. When set, takes
            precedence over ``voice_sample`` and ``self.voice``. Used by
            the dubbing pipeline to encode each speaker's sample once and
            reuse it across all of that speaker's segments.
        exaggeration: Chatterbox emotional-intensity knob (default
            ``0.5``). ``None`` (default) means do not pass the kwarg —
            Chatterbox uses its own default and we stay forward-compatible
            with changes to it. ``0.7+`` produces dramatic output.
        cfg_weight: Chatterbox classifier-free-guidance weight (default
            ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
            slow pacing.
        temperature: Chatterbox sampling temperature (default ``0.8``).
            ``None`` means do not pass.

    Returns:
        Mono ``Audio`` at ``SAMPLE_RATE`` holding the synthesized speech.
    """
    import tempfile
    from pathlib import Path

    import numpy as np

    if self._model is None:
        self._init_local()

    # Only forward knobs the caller explicitly set. Passing nothing
    # for a knob lets Chatterbox use its own default — important so a
    # future Chatterbox default change doesn't get pinned by us.
    knobs: dict[str, float] = {}
    if exaggeration is not None:
        knobs["exaggeration"] = exaggeration
    if cfg_weight is not None:
        knobs["cfg_weight"] = cfg_weight
    if temperature is not None:
        knobs["temperature"] = temperature

    speaker_wav_path: Path | None = None
    cleanup_path = False

    # The temp WAV is created inside the ``try`` so the ``finally`` below
    # removes it even when encoding the sample fails.
    try:
        if voice_sample_path is not None:
            speaker_wav_path = Path(voice_sample_path)
        else:
            # Explicit ``None`` checks: the fallback must depend on whether
            # a sample was supplied, not on the sample's truthiness.
            effective_sample = voice_sample if voice_sample is not None else self.voice
            if effective_sample is not None:
                # Reserve a unique name, then close the handle before
                # writing: a file that is still open cannot be reopened by
                # name on every platform.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    temp_name = f.name
                speaker_wav_path = Path(temp_name)
                cleanup_path = True
                effective_sample.save(temp_name)

        wav = self._model.generate(
            text=text,
            language_id=self.language,
            audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
            **knobs,
        )

        audio_data = wav.cpu().float().numpy().squeeze()
        if audio_data.ndim == 0:
            # ``squeeze`` collapses a single-sample result to a 0-d array;
            # restore a 1-d array so ``len`` works below.
            audio_data = np.array([audio_data], dtype=np.float32)

        metadata = AudioMetadata(
            sample_rate=self.SAMPLE_RATE,
            channels=1,
            sample_width=2,
            duration_seconds=len(audio_data) / self.SAMPLE_RATE,
            frame_count=len(audio_data),
        )
        return Audio(audio_data, metadata)
    finally:
        # Only delete files created here; a caller-supplied path is kept.
        if cleanup_path and speaker_wav_path is not None:
            speaker_wav_path.unlink(missing_ok=True)

TextToMusic

Bases: ManagedPredictor

Generates music from text descriptions using MusicGen.

Source code in src/videopython/ai/generation/audio.py

class TextToMusic(ManagedPredictor):
    """Generates music from text descriptions using MusicGen.

    The processor and model are created on the first ``generate_audio`` call
    (see ``_init_local``) and stored on ``_processor`` and ``_model``.
    """

    _model_attrs = ("_model", "_processor")

    def __init__(self, device: str | None = None):
        # Nothing is loaded here; both attributes stay ``None`` until first use.
        self.device = device
        self._processor: Any = None
        self._model: Any = None

    def _init_local(self) -> None:
        """Initialize local MusicGen model."""
        import os

        from videopython.ai._optional import require

        transformers_mod = require("transformers", feature="TextToMusic")
        processor_cls = transformers_mod.AutoProcessor
        model_cls = transformers_mod.MusicgenForConditionalGeneration

        # MPS is an allowed device below, so enable PyTorch's MPS fallback.
        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

        asked_for = self.device
        resolved = select_device(asked_for, mps_allowed=True)

        repo_id = "facebook/musicgen-small"
        self._processor = processor_cls.from_pretrained(repo_id, revision=pinned(repo_id))
        model = model_cls.from_pretrained(repo_id, revision=pinned(repo_id))
        self._model = model
        model.to(resolved)

        self.device = resolved
        log_device_initialization(
            "TextToMusic",
            requested_device=asked_for,
            resolved_device=resolved,
        )

    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
        """Generate music audio from text description.

        Args:
            text: Description of the music to generate.
            max_new_tokens: Forwarded to the model's ``generate`` call.

        Returns:
            Mono ``Audio`` at the sampling rate reported by the model's audio
            encoder config.
        """
        if self._model is None:
            self._init_local()

        encoded = self._processor(text=[text], padding=True, return_tensors="pt")
        # Move every value that supports ``.to`` onto the model's device and
        # pass anything else through untouched.
        model_inputs = {}
        for key, value in encoded.items():
            model_inputs[key] = value.to(self.device) if hasattr(value, "to") else value

        generated = self._model.generate(**model_inputs, max_new_tokens=max_new_tokens)
        rate = self._model.config.audio_encoder.sampling_rate

        # Take the first item and first channel of the generated batch.
        samples = generated[0, 0].cpu().float().numpy()
        sample_count = len(samples)

        metadata = AudioMetadata(
            sample_rate=rate,
            channels=1,
            sample_width=2,
            duration_seconds=sample_count / rate,
            frame_count=sample_count,
        )
        return Audio(samples, metadata)

generate_audio

generate_audio(
    text: str, max_new_tokens: int = 256
) -> Audio

Generate music audio from text description.

Source code in src/videopython/ai/generation/audio.py

def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
    """Generate music audio from text description.

    Args:
        text: Description of the music to generate.
        max_new_tokens: Forwarded to the model's ``generate`` call.

    Returns:
        Mono ``Audio`` at the sampling rate reported by the model's audio
        encoder config.
    """
    if self._model is None:
        self._init_local()

    encoded = self._processor(text=[text], padding=True, return_tensors="pt")
    # Move every value that supports ``.to`` onto the model's device and
    # pass anything else through untouched.
    model_inputs = {}
    for key, value in encoded.items():
        model_inputs[key] = value.to(self.device) if hasattr(value, "to") else value

    generated = self._model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    rate = self._model.config.audio_encoder.sampling_rate

    # Take the first item and first channel of the generated batch.
    samples = generated[0, 0].cpu().float().numpy()
    sample_count = len(samples)

    metadata = AudioMetadata(
        sample_rate=rate,
        channels=1,
        sample_width=2,
        duration_seconds=sample_count / rate,
        frame_count=sample_count,
    )
    return Audio(samples, metadata)