Audio

Core audio class for loading, manipulating, analyzing, and saving audio files.

Audio

The Audio class handles audio data with numpy arrays, supporting operations like slicing, concatenation, overlay mixing, resampling, analysis, and format conversion.

from videopython.base import Audio

# Load from file
audio = Audio.from_path("music.mp3")

# Create silent track
silent = Audio.create_silent(duration_seconds=5.0, stereo=True)

# Basic operations
mono = audio.to_mono()
resampled = audio.resample(16000)
segment = audio.slice(start_seconds=1.0, end_seconds=5.0)

# Combine audio (audio1 and audio2 are Audio objects with matching sample rates and sample widths)
combined = audio1.concat(audio2, crossfade=0.5)
mixed = audio1.overlay(audio2, position=2.0)

# Save
audio.save("output.wav")

Audio

A class to handle audio data with numpy arrays

Attributes:

Name	Type	Description
`data`	`ndarray`	Audio data as a numpy array, normalized between -1 and 1
`metadata`	`AudioMetadata`	Metadata about the audio file

Source code in src/videopython/base/audio/audio.py

class Audio:
    """
    A class to handle audio data with numpy arrays

    Attributes:
        data (np.ndarray): Audio data as a numpy array, normalized between -1 and 1
        metadata (AudioMetadata): Metadata about the audio file
    """

    def __init__(self, data: np.ndarray, metadata: AudioMetadata):
        """
        Initialize Audio object

        Args:
            data: Audio data as numpy array, normalized between -1 and 1
            metadata: AudioMetadata object containing audio properties
        """
        self.data = data
        self.metadata = metadata

    @property
    def is_silent(self) -> bool:
        """
        Check if the audio track is silent (all samples are effectively zero)

        Returns:
            bool: True if the audio is silent, False otherwise
        """
        # Use a small threshold to account for floating-point precision
        return bool(np.all(np.abs(self.data) < 1e-7))

    @staticmethod
    def _get_ffmpeg_info(file_path: Path) -> dict:
        """Get audio metadata using ffprobe

        Args:
            file_path: Path of the media file to probe

        Returns:
            dict: Keys "sample_rate", "channels", "duration" and "bit_depth",
                taken from the first audio stream (and the container format)
                that ffprobe reports

        Raises:
            AudioLoadError: If ffprobe exits with an error or no audio stream is found
        """
        cmd = [
            "ffprobe",
            "-v",
            "quiet",
            "-print_format",
            "json",
            "-show_format",
            "-show_streams",
            str(file_path),
        ]

        try:
            output = subprocess.check_output(cmd)
        except subprocess.CalledProcessError as e:
            # Keep the original failure attached for easier debugging
            raise AudioLoadError(f"Error getting audio info: {e}") from e

        info = json.loads(output.decode())

        # First stream whose codec type is audio; tolerate a missing "streams" list
        audio_stream = next(
            (stream for stream in info.get("streams", []) if stream.get("codec_type") == "audio"),
            None,
        )
        if audio_stream is None:
            raise AudioLoadError("No audio stream found")

        # ffprobe can report bits_per_sample as 0 (e.g. for lossy codecs);
        # treat that like a missing value and fall back to 16-bit.
        bit_depth = int(audio_stream.get("bits_per_sample") or 16)

        return {
            "sample_rate": int(audio_stream["sample_rate"]),
            "channels": int(audio_stream["channels"]),
            "duration": float(info["format"]["duration"]),
            "bit_depth": bit_depth,
        }

    @classmethod
    def create_silent(
        cls, duration_seconds: float, stereo: bool = True, sample_rate: int = 44100, sample_width: int = 2
    ) -> Audio:
        """
        Create a silent audio track.

        Args:
            duration_seconds: Length of the silent track in seconds
            stereo: If True, create stereo track; if False, create mono track (default: True)
            sample_rate: Sample rate in Hz (default: 44100)
            sample_width: Sample width in bytes (default: 2, which is 16-bit)

        Returns:
            Audio: New Audio instance with silent track

        Raises:
            ValueError: If duration is negative or other parameters are invalid
        """
        if duration_seconds <= 0:
            raise ValueError("Duration must be positive")
        if sample_rate <= 0:
            raise ValueError("Sample rate must be positive")
        if sample_width not in {1, 2, 4}:
            raise ValueError("Sample width must be 1, 2, or 4 bytes")

        # Calculate number of frames
        frame_count = int(duration_seconds * sample_rate)

        # Create silent data array
        channels = 2 if stereo else 1
        shape = (frame_count, channels) if stereo else (frame_count,)
        data = np.zeros(shape, dtype=np.float32)

        # Create metadata
        metadata = AudioMetadata(
            sample_rate=sample_rate,
            channels=channels,
            sample_width=sample_width,
            duration_seconds=duration_seconds,
            frame_count=frame_count,
        )

        return cls(data, metadata)

    @classmethod
    def from_path(cls, file_path: str | Path) -> Audio:
        """
        Load audio from a file using ffmpeg

        Args:
            file_path: Path to the audio file

        Returns:
            Audio: New Audio instance

        Raises:
            FileNotFoundError: If the file doesn't exist
            AudioLoadError: If there's an error loading the audio
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Get audio info
        info = cls._get_ffmpeg_info(file_path)

        # Convert to WAV using ffmpeg
        cmd = [
            "ffmpeg",
            "-i",
            str(file_path),
            "-f",
            "wav",
            "-ar",
            str(info["sample_rate"]),  # sample rate
            "-ac",
            str(info["channels"]),  # channels
            "-bits_per_raw_sample",
            str(info["bit_depth"]),
            "-",  # Output to stdout
        ]

        try:
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            wav_data, stderr = process.communicate()

            if process.returncode != 0:
                raise AudioLoadError(f"FFmpeg error: {stderr.decode()}")

            # Read WAV data
            with io.BytesIO(wav_data) as wav_io:
                with wave.open(wav_io, "rb") as wav_file:
                    # Get WAV metadata
                    sample_width = wav_file.getsampwidth()
                    channels = wav_file.getnchannels()
                    sample_rate = wav_file.getframerate()
                    n_frames = wav_file.getnframes()

                    # Read raw audio data
                    raw_data = wav_file.readframes(n_frames)

                    # Convert bytes to numpy array based on sample width
                    dtype_map = {1: np.int8, 2: np.int16, 4: np.int32}
                    dtype = dtype_map.get(sample_width)
                    if dtype is None:
                        raise AudioLoadError(f"Unsupported sample width: {sample_width}")

                    data = np.frombuffer(raw_data, dtype=dtype)

                    # Reshape if stereo
                    if channels == 2:
                        data = data.reshape(-1, 2)

                    # Convert to float32
                    data = data.astype(np.float32)

                    # Reshape before normalization if stereo
                    if channels == 2:
                        data = data.reshape(-1, 2)

                    # Normalize to float between -1 and 1
                    max_value = float(np.iinfo(dtype).max)  # type: ignore
                    data = data / max_value

                    # Ensure normalization is within bounds due to floating point precision
                    data = np.clip(data, -1.0, 1.0)

                    # Calculate frame count from actual data length
                    # For stereo, len(data) is already correct after reshape
                    frame_count = len(data)

                    metadata = AudioMetadata(
                        sample_rate=sample_rate,
                        channels=channels,
                        sample_width=sample_width,
                        duration_seconds=info["duration"],
                        frame_count=frame_count,
                    )

                    return cls(data, metadata)

        except subprocess.CalledProcessError as e:
            raise AudioLoadError(f"Error running ffmpeg: {e}")

    @classmethod
    def from_file(cls, file_path: str | Path) -> Audio:
        """Deprecated: Use from_path() instead."""
        import warnings

        warnings.warn(
            "Audio.from_file() is deprecated, use Audio.from_path() instead",
            DeprecationWarning,
            stacklevel=2,
        )
        return cls.from_path(file_path)

    @classmethod
    def silence(
        cls,
        duration: float,
        sample_rate: int = 44100,
        channels: int = 2,
    ) -> Audio:
        """Create a silent audio track.

        Args:
            duration: Duration in seconds.
            sample_rate: Sample rate in Hz. Default: 44100.
            channels: Number of channels (1 for mono, 2 for stereo). Default: 2.

        Returns:
            Audio: Silent audio track with the specified parameters.

        Example:
            >>> silence = Audio.silence(duration=5.0)  # 5 seconds of silence
            >>> silence = Audio.silence(duration=2.0, sample_rate=22050, channels=1)
        """
        frame_count = int(duration * sample_rate)
        data = np.zeros((frame_count, channels), dtype=np.float32)

        metadata = AudioMetadata(
            sample_rate=sample_rate,
            channels=channels,
            sample_width=2,
            duration_seconds=duration,
            frame_count=frame_count,
        )

        return cls(data, metadata)

    def to_mono(self) -> Audio:
        """
        Convert stereo audio to mono by averaging channels

        Returns:
            Audio: New Audio instance with mono audio
        """
        if self.metadata.channels == 1:
            return self

        mono_data = self.data.mean(axis=1)

        new_metadata = AudioMetadata(
            sample_rate=self.metadata.sample_rate,
            channels=1,
            sample_width=self.metadata.sample_width,
            duration_seconds=self.metadata.duration_seconds,
            frame_count=len(mono_data),
        )

        return Audio(mono_data, new_metadata)

    def _to_stereo(self) -> Audio:
        """
        Convert mono audio to stereo by duplicating the channel.
        If already stereo, return self.

        Returns:
            Audio: Stereo version of the audio
        """
        if self.metadata.channels == 2:
            return self

        # Reshape mono data to 2D array and duplicate channel
        stereo_data = np.column_stack((self.data, self.data))

        new_metadata = AudioMetadata(
            sample_rate=self.metadata.sample_rate,
            channels=2,
            sample_width=self.metadata.sample_width,
            duration_seconds=self.metadata.duration_seconds,
            frame_count=len(stereo_data),
        )

        return Audio(stereo_data, new_metadata)

    def get_channel(self, channel: int) -> Audio:
        """
        Extract a single channel from the audio

        Args:
            channel: Channel number (0 for left, 1 for right)

        Returns:
            Audio: New Audio instance with single channel

        Raises:
            ValueError: If channel number is invalid
        """
        if self.metadata.channels == 1:
            return self

        if channel not in [0, 1]:
            raise ValueError("Channel must be 0 (left) or 1 (right)")

        channel_data = self.data[:, channel]

        new_metadata = AudioMetadata(
            sample_rate=self.metadata.sample_rate,
            channels=1,
            sample_width=self.metadata.sample_width,
            duration_seconds=self.metadata.duration_seconds,
            frame_count=len(channel_data),
        )

        return Audio(channel_data, new_metadata)

    def resample(self, target_sample_rate: int) -> Audio:
        """
        Resample the audio to a new sample rate

        Args:
            target_sample_rate: New sample rate in Hz

        Returns:
            Audio: New Audio instance with resampled audio
        """
        if target_sample_rate == self.metadata.sample_rate:
            return self

        # Calculate resampling ratio
        ratio = target_sample_rate / self.metadata.sample_rate

        target_length = round(self.data.shape[0] * ratio)

        audio_array = self.data
        if self.metadata.channels == 1:
            audio_array = audio_array.reshape(-1, 1)

        resampled_data = np.zeros((target_length, self.metadata.channels), dtype=np.float32)

        for channel in range(self.metadata.channels):
            resampled_data[:, channel] = self._resample_channel(audio_array[:, channel], target_length)

        new_metadata = AudioMetadata(
            sample_rate=target_sample_rate,
            channels=self.metadata.channels,
            sample_width=self.metadata.sample_width,
            duration_seconds=target_length / target_sample_rate,
            frame_count=target_length,
        )
        if self.metadata.channels == 1:
            resampled_data = resampled_data.flatten()

        return Audio(resampled_data, new_metadata)

    @staticmethod
    def _resample_channel(data: np.ndarray, new_length: int) -> np.ndarray:
        """Resample a single channel of audio data to a new length"""

        data_fourier = np.fft.rfft(data)
        original_length = data.shape[0]

        newshape = [new_length // 2 + 1]

        data_fourier_placeholder = np.zeros(newshape, data_fourier.dtype)

        min_length = min(new_length, original_length)
        nyquist = min_length // 2 + 1
        sl = [slice(0, nyquist)]
        data_fourier_placeholder[tuple(sl)] = data_fourier[tuple(sl)]

        if min_length % 2 == 0:
            if new_length < original_length:
                sl = [slice(min_length // 2, min_length // 2 + 1)]
                data_fourier_placeholder[tuple(sl)] *= 2.0

                sl = [slice(min_length // 2, min_length // 2 + 1)]
                data_fourier_placeholder[tuple(sl)] *= 0.5

        resampled_data = np.fft.irfft(data_fourier_placeholder, new_length)

        resampled_data *= float(new_length) / float(original_length)

        return resampled_data

    def concat(self, other: Audio, crossfade: float = 0.0) -> Audio:
        """
        Concatenate another audio segment to this one.
        If mixing mono and stereo, converts mono to stereo.

        Args:
            other: Another Audio object to concatenate
            crossfade: Duration of crossfade in seconds (default: 0.0 for no crossfade)

        Returns:
            Audio: New Audio object with concatenated data

        Raises:
            ValueError: If audio metadata doesn't match or crossfade duration is invalid
        """
        if abs(self.metadata.sample_rate - other.metadata.sample_rate) > 0:
            raise ValueError("Sample rates must match")
        if self.metadata.sample_width != other.metadata.sample_width:
            raise ValueError("Sample widths must match")

        # Determine output format (mono or stereo)
        output_stereo = self.metadata.channels == 2 or other.metadata.channels == 2

        # Convert to appropriate format if needed
        first = self._to_stereo() if output_stereo and self.metadata.channels == 1 else self
        second = other._to_stereo() if output_stereo and other.metadata.channels == 1 else other

        if first.metadata.channels != second.metadata.channels:
            raise ValueError("Channel counts must match")

        # Handle case with no crossfade
        if crossfade <= 0:
            if first.metadata.channels == 1:
                concatenated_data = np.concatenate([first.data, second.data])
            else:
                concatenated_data = np.vstack([first.data, second.data])

            new_metadata = AudioMetadata(
                sample_rate=first.metadata.sample_rate,
                channels=first.metadata.channels,
                sample_width=first.metadata.sample_width,
                duration_seconds=first.metadata.duration_seconds + second.metadata.duration_seconds,
                frame_count=len(concatenated_data),
            )

            return Audio(concatenated_data, new_metadata)

        # Validate crossfade duration
        if crossfade > min(first.metadata.duration_seconds, second.metadata.duration_seconds):
            raise ValueError("Crossfade duration cannot exceed duration of either audio segment")

        # Calculate crossfade parameters
        crossfade_samples = int(crossfade * first.metadata.sample_rate)

        # Calculate output length and create output array
        total_samples = len(first.data) + len(second.data) - crossfade_samples
        if first.metadata.channels == 1:
            output = np.zeros(total_samples, dtype=np.float32)
        else:
            output = np.zeros((total_samples, 2), dtype=np.float32)  # type: ignore

        # Copy non-crossfaded portions
        crossfade_start = len(first.data) - crossfade_samples
        output[:crossfade_start] = first.data[:crossfade_start]
        output[crossfade_start + crossfade_samples :] = second.data[crossfade_samples:]

        # Create crossfade ramps
        fade_out = np.linspace(1, 0, crossfade_samples)
        fade_in = np.linspace(0, 1, crossfade_samples)

        # Apply crossfade
        if first.metadata.channels == 1:
            output[crossfade_start : crossfade_start + crossfade_samples] = (
                first.data[crossfade_start:] * fade_out + second.data[:crossfade_samples] * fade_in
            )
        else:
            for channel in range(first.metadata.channels):
                output[crossfade_start : crossfade_start + crossfade_samples, channel] = (
                    first.data[crossfade_start:, channel] * fade_out
                    + second.data[:crossfade_samples, channel] * fade_in
                )

        # Create new metadata
        new_duration = total_samples / first.metadata.sample_rate
        new_metadata = AudioMetadata(
            sample_rate=first.metadata.sample_rate,
            channels=first.metadata.channels,
            sample_width=first.metadata.sample_width,
            duration_seconds=new_duration,
            frame_count=total_samples,
        )

        return Audio(output, new_metadata)

    def slice(self, start_seconds: float = 0.0, end_seconds: float | None = None) -> Audio:
        """
        Extract a portion of the audio between start_seconds and end_seconds.

        Args:
            start_seconds: Start time in seconds (default: 0.0)
            end_seconds: End time in seconds (default: None, meaning end of audio)

        Returns:
            Audio: New Audio instance with the extracted portion

        Raises:
            ValueError: If start_seconds or end_seconds are invalid
        """
        # Validate inputs
        if start_seconds < 0:
            raise ValueError("start_seconds must be non-negative")

        duration_seconds = self.metadata.duration_seconds
        duration_tolerance = 1e-6

        if end_seconds is not None:
            if end_seconds < start_seconds:
                raise ValueError("end_seconds must be greater than start_seconds")
            if end_seconds > duration_seconds + duration_tolerance:
                raise ValueError("end_seconds cannot exceed audio duration")
            end_seconds = min(end_seconds, duration_seconds)
        else:
            end_seconds = duration_seconds

        # Convert seconds to sample indices
        start_idx = int(start_seconds * self.metadata.sample_rate)
        end_idx = int(end_seconds * self.metadata.sample_rate)

        # Extract the portion of audio data
        sliced_data = self.data[start_idx:end_idx]

        # Calculate new duration
        new_duration = (end_idx - start_idx) / self.metadata.sample_rate

        # Create new metadata
        new_metadata = AudioMetadata(
            sample_rate=self.metadata.sample_rate,
            channels=self.metadata.channels,
            sample_width=self.metadata.sample_width,
            duration_seconds=new_duration,
            frame_count=len(sliced_data) if self.metadata.channels == 1 else len(sliced_data),
        )

        return Audio(sliced_data, new_metadata)

    def scale_volume(self, factor: float) -> Audio:
        """
        Scale audio volume by a factor.

        Args:
            factor: Volume multiplier. 1.0 = no change, 0.5 = half volume,
                    2.0 = double volume (may clip).

        Returns:
            Audio: New Audio object with scaled volume.

        Raises:
            ValueError: If factor is negative.
        """
        if factor < 0:
            raise ValueError("Volume factor must be non-negative")

        scaled_data = self.data * factor
        # Clip to prevent overflow
        scaled_data = np.clip(scaled_data, -1.0, 1.0)

        return Audio(scaled_data.astype(np.float32), self.metadata)

    def fit_to_duration(self, target_duration: float) -> "Audio":
        """
        Adjust audio duration to match a target, slicing or padding with silence.

        If audio is longer than target, it will be sliced.
        If audio is shorter than target, silence will be appended.

        Args:
            target_duration: Target duration in seconds.

        Returns:
            Audio: New Audio object with the target duration.

        Raises:
            ValueError: If target_duration is not positive.
        """
        if target_duration <= 0:
            raise ValueError("Target duration must be positive")

        current_duration = self.metadata.duration_seconds

        if current_duration > target_duration:
            return self.slice(0, target_duration)
        elif current_duration < target_duration:
            silence = Audio.create_silent(
                target_duration - current_duration,
                stereo=self.metadata.channels == 2,
                sample_rate=self.metadata.sample_rate,
                sample_width=self.metadata.sample_width,
            )
            return self.concat(silence)
        return self

    def time_stretch(self, speed: float) -> Audio:
        """
        Time-stretch audio by a speed factor (pitch-preserving).

        Uses ffmpeg's atempo filter for high-quality time stretching.
        For speeds outside the 0.5-2.0 range, multiple atempo filters are chained.

        Args:
            speed: Speed multiplier. 2.0 = twice as fast (half duration),
                   0.5 = half speed (double duration).

        Returns:
            Audio: New Audio object with time-stretched audio.

        Raises:
            ValueError: If speed is not positive.
            AudioLoadError: If ffmpeg fails.
        """
        if speed <= 0:
            raise ValueError("Speed must be positive")

        if abs(speed - 1.0) < 0.001:
            # No change needed
            return Audio(self.data.copy(), self.metadata)

        # Build atempo filter string, chaining for extreme speeds
        # atempo only supports range [0.5, 2.0], so we chain multiple filters
        filters = []
        remaining_speed = speed

        while remaining_speed > 2.0:
            filters.append("atempo=2.0")
            remaining_speed /= 2.0
        while remaining_speed < 0.5:
            filters.append("atempo=0.5")
            remaining_speed /= 0.5

        if abs(remaining_speed - 1.0) > 0.001:
            filters.append(f"atempo={remaining_speed}")

        filter_str = ",".join(filters) if filters else "anull"

        # Save current audio to temp WAV, process with ffmpeg, read back
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as input_file:
            input_path = input_file.name

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            # Save current audio to temp file
            self.save(input_path, format="wav")

            # Run ffmpeg with atempo filter
            cmd = [
                "ffmpeg",
                "-y",
                "-i",
                input_path,
                "-af",
                filter_str,
                "-ar",
                str(self.metadata.sample_rate),
                "-ac",
                str(self.metadata.channels),
                output_path,
            ]

            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            _, stderr = process.communicate()

            if process.returncode != 0:
                raise AudioLoadError(f"FFmpeg time stretch failed: {stderr.decode()}")

            # Load the result
            result = Audio.from_path(output_path)
            return result

        finally:
            # Clean up temp files
            import os

            for path in [input_path, output_path]:
                try:
                    os.unlink(path)
                except OSError:
                    pass

    def overlay(self, other: Audio, position: float = 0.0) -> Audio:
        """
        Overlay another audio segment on top of this one, mixing both signals.
        If mixing mono and stereo, converts mono to stereo.

        Args:
            other: Another Audio object to overlay
            position: Start position in seconds for the overlay (default: 0.0)

        Returns:
            Audio: New Audio object with mixed audio

        Raises:
            ValueError: If audio metadata doesn't match or position is invalid
        """
        if abs(self.metadata.sample_rate - other.metadata.sample_rate) > 0:
            raise ValueError("Sample rates must match")
        if self.metadata.sample_width != other.metadata.sample_width:
            raise ValueError("Sample widths must match")
        if position < 0:
            raise ValueError("Position cannot be negative")

        # Determine output format (mono or stereo)
        output_stereo = self.metadata.channels == 2 or other.metadata.channels == 2

        # Convert to appropriate format if needed
        base = self._to_stereo() if output_stereo and self.metadata.channels == 1 else self
        overlay_audio = other._to_stereo() if output_stereo and other.metadata.channels == 1 else other

        if base.metadata.channels != overlay_audio.metadata.channels:
            raise ValueError("Channel counts must match")

        # Convert position to samples, using ceil to ensure we don't cut off any audio
        position_samples = int(np.ceil(position * base.metadata.sample_rate))

        # Calculate the total length needed for the output
        total_length = max(len(base.data), position_samples + len(overlay_audio.data))

        # Create output array with appropriate shape
        if base.metadata.channels == 1:
            output = np.zeros(total_length, dtype=np.float32)
        else:
            output = np.zeros((total_length, 2), dtype=np.float32)  # type: ignore

        # Copy base audio
        output[: len(base.data)] = base.data

        # Add overlay audio at the specified position
        overlay_end = position_samples + len(overlay_audio.data)
        output[position_samples:overlay_end] += overlay_audio.data

        # Prevent clipping by scaling if necessary
        max_amplitude = np.max(np.abs(output))
        if max_amplitude > 1.0:
            output = output / max_amplitude

        # Create new metadata, using actual duration calculation
        new_duration = max(base.metadata.duration_seconds, position + overlay_audio.metadata.duration_seconds)
        new_metadata = AudioMetadata(
            sample_rate=base.metadata.sample_rate,
            channels=base.metadata.channels,
            sample_width=base.metadata.sample_width,
            duration_seconds=new_duration,
            frame_count=total_length,
        )

        return Audio(output, new_metadata)

    def save(self, file_path: str | Path, format: str | None = None) -> None:
        """
        Save audio to a file using ffmpeg

        Args:
            file_path: Path to save the audio file
            format: Output format (e.g., 'mp3', 'wav'). If None, inferred from extension.
        """
        file_path = Path(file_path)

        # Convert data back to int16
        int_data = (self.data * np.iinfo(np.int16).max).astype(np.int16)

        # Create WAV in memory
        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wav_file:
            wav_file.setnchannels(self.metadata.channels)
            wav_file.setsampwidth(self.metadata.sample_width)
            wav_file.setframerate(self.metadata.sample_rate)
            wav_file.writeframes(int_data.tobytes())

        wav_io.seek(0)

        # Check and infer format
        if format is None:
            format = file_path.suffix[1:]  # Remove the dot

        # Validate format
        SUPPORTED_FORMATS = {"mp3", "wav", "ogg", "flac"}
        if format not in SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {format}. Supported formats are: {', '.join(SUPPORTED_FORMATS)}")

        # Build ffmpeg command
        cmd = [
            "ffmpeg",
            "-y",  # Overwrite output file
            "-f",
            "wav",  # Input format
            "-i",
            "-",  # Read from stdin
        ]

        if format:
            cmd.extend(["-f", format])

        cmd.append(str(file_path))

        try:
            process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
            _, stderr = process.communicate(wav_io.getvalue())

            if process.returncode != 0:
                raise AudioLoadError(f"Error saving audio: {stderr.decode()}")

        except subprocess.CalledProcessError as e:
            raise AudioLoadError(f"Error running ffmpeg: {e}")

    def __len__(self) -> int:
        """Returns the number of samples"""
        return self.metadata.frame_count

    def __repr__(self) -> str:
        """String representation of the Audio object"""
        return (
            f"Audio(channels={self.metadata.channels}, "
            f"sample_rate={self.metadata.sample_rate}Hz, "
            f"duration={self.metadata.duration_seconds:.2f}s)"
        )

    # -------------------------------------------------------------------------
    # Audio Analysis Methods
    # -------------------------------------------------------------------------

    def get_levels(
        self,
        start_seconds: float = 0.0,
        end_seconds: float | None = None,
    ) -> "AudioLevels":
        """Measure RMS and peak levels of a segment.

        Args:
            start_seconds: Start time in seconds (default: 0.0)
            end_seconds: End time in seconds (default: None, meaning end of audio)

        Returns:
            AudioLevels with RMS, peak, and dB measurements

        Example:
            >>> audio = Audio.from_path("audio.mp3")
            >>> levels = audio.get_levels()
            >>> print(f"Peak: {levels.db_peak:.1f} dB")
        """
        from videopython.base.audio.analysis import AudioLevels

        segment = self.slice(start_seconds, end_seconds)

        # Stereo samples are pooled into one flat array before measuring
        samples = segment.data
        if segment.metadata.channels == 2:
            samples = samples.flatten()

        rms = float(np.sqrt(np.mean(samples**2)))
        peak = float(np.max(np.abs(samples)))

        # Floor the linear values so that silence does not produce log10(0)
        floor = 1e-10
        db_rms = 20 * np.log10(max(rms, floor))
        db_peak = 20 * np.log10(max(peak, floor))

        return AudioLevels(rms=rms, peak=peak, db_rms=float(db_rms), db_peak=float(db_peak))

    def get_levels_over_time(
        self,
        window_seconds: float = 0.1,
        hop_seconds: float | None = None,
    ) -> list[tuple[float, "AudioLevels"]]:
        """Calculate audio levels over time using a sliding window.

        Only windows that fit entirely inside the audio duration are measured.

        Args:
            window_seconds: Window size in seconds (default: 0.1)
            hop_seconds: Hop size in seconds (default: window_seconds / 2)

        Returns:
            List of (timestamp, AudioLevels) tuples where timestamp is the
            center of each window

        Raises:
            ValueError: If window_seconds or hop_seconds is not positive

        Example:
            >>> levels_over_time = audio.get_levels_over_time(window_seconds=0.1)
            >>> for timestamp, levels in levels_over_time:
            ...     print(f"{timestamp:.2f}s: {levels.db_rms:.1f} dB")
        """
        if window_seconds <= 0:
            raise ValueError("window_seconds must be positive")
        if hop_seconds is None:
            hop_seconds = window_seconds / 2
        if hop_seconds <= 0:
            # A zero hop would never advance and the loop below would not terminate
            raise ValueError("hop_seconds must be positive")

        duration = self.metadata.duration_seconds
        results = []

        # Derive each start from the window index rather than repeatedly adding
        # hop_seconds, so rounding error does not accumulate across windows.
        index = 0
        window_start = 0.0
        while window_start + window_seconds <= duration:
            levels = self.get_levels(window_start, window_start + window_seconds)
            results.append((window_start + window_seconds / 2, levels))
            index += 1
            window_start = index * hop_seconds

        return results

    def detect_silence(
        self,
        threshold_db: float = -40.0,
        min_duration: float = 0.5,
        window_seconds: float = 0.1,
    ) -> list["SilentSegment"]:
        """Find stretches of the audio whose RMS level stays below a threshold.

        Levels are measured with half-overlapping windows; consecutive quiet
        windows are merged into one run, and runs shorter than min_duration
        are discarded.

        Args:
            threshold_db: RMS level below which audio is considered silent (default: -40 dB)
            min_duration: Minimum duration for a segment to be classified as silent (default: 0.5s)
            window_seconds: Window size for level analysis (default: 0.1s)

        Returns:
            List of SilentSegment objects representing detected silent regions

        Example:
            >>> silent_segments = audio.detect_silence(threshold_db=-40.0, min_duration=0.5)
            >>> for seg in silent_segments:
            ...     print(f"Silence: {seg.start:.2f}s - {seg.end:.2f}s")
        """
        from videopython.base.audio.analysis import SilentSegment

        half_window = window_seconds / 2
        found = []
        inside_run = False
        run_start = 0.0
        run_rms: list[float] = []

        def close_run(run_end: float) -> None:
            # Record the current run if it is long enough to count as silence
            length = run_end - run_start
            if length >= min_duration:
                mean_rms = sum(run_rms) / len(run_rms) if run_rms else 0.0
                found.append(
                    SilentSegment(
                        start=run_start,
                        end=run_end,
                        duration=length,
                        avg_level=mean_rms,
                    )
                )

        windows = self.get_levels_over_time(window_seconds=window_seconds, hop_seconds=half_window)
        for center, window_levels in windows:
            # Timestamps are window centers; runs are bounded by window starts
            window_start = center - half_window

            if window_levels.db_rms < threshold_db:
                if not inside_run:
                    inside_run = True
                    run_start = window_start
                    run_rms = []
                run_rms.append(window_levels.rms)
            elif inside_run:
                # First loud window after a quiet run ends that run at its own start
                close_run(window_start)
                inside_run = False
                run_rms = []

        # A run still open after the last window extends to the end of the audio
        if inside_run:
            close_run(self.metadata.duration_seconds)

        return found

    def classify_segments(
        self,
        segment_length: float = 2.0,
        overlap: float = 0.5,
    ) -> list["AudioSegment"]:
        """Classify audio segments as speech, music, noise, or silence.

        This uses basic signal processing heuristics (no ML):
        - Zero-crossing rate (higher for speech/noise)
        - Spectral flatness (higher for noise, lower for music)
        - Energy distribution across frequency bands

        Only segments that fit entirely inside the audio duration are
        classified; a trailing remainder shorter than segment_length is skipped.

        Args:
            segment_length: Length of each segment to classify in seconds (default: 2.0)
            overlap: Overlap between segments as fraction (default: 0.5)

        Returns:
            List of AudioSegment objects with classifications

        Raises:
            ValueError: If segment_length is not positive, or overlap is 1.0 or
                greater (the analysis position would never advance)

        Example:
            >>> segments = audio.classify_segments(segment_length=2.0)
            >>> for seg in segments:
            ...     print(f"{seg.start:.1f}-{seg.end:.1f}s: {seg.segment_type.value}")
        """
        if segment_length <= 0:
            raise ValueError("segment_length must be positive")

        hop_length = segment_length * (1 - overlap)
        if hop_length <= 0:
            # With no forward step the loop below would never terminate
            raise ValueError("overlap must be less than 1.0")

        from videopython.base.audio.analysis import AudioSegment

        duration = self.metadata.duration_seconds
        segments = []

        # Starts are derived from the segment index so float error does not accumulate
        index = 0
        current_time = 0.0
        while current_time + segment_length <= duration:
            segment_end = current_time + segment_length
            segment_audio = self.slice(current_time, segment_end)
            segment_type, confidence = self._classify_segment(segment_audio)
            levels = self.get_levels(current_time, segment_end)

            segments.append(
                AudioSegment(
                    start=current_time,
                    end=segment_end,
                    segment_type=segment_type,
                    confidence=confidence,
                    levels=levels,
                )
            )

            index += 1
            current_time = index * hop_length

        return segments

    def _classify_segment(self, segment: "Audio") -> tuple["AudioSegmentType", float]:
        """Label one audio segment with a best-effort, non-ML heuristic.

        The segment is mixed down to mono, then judged on its RMS level,
        zero-crossing rate, spectral flatness, share of energy in the
        300-3400 Hz band, and spectral centroid.

        Args:
            segment: Audio segment to classify

        Returns:
            Tuple of (AudioSegmentType, confidence)
        """
        from videopython.base.audio.analysis import AudioSegmentType

        samples = segment.to_mono().data

        # Very quiet input is labelled silence before any spectral work
        level = np.sqrt(np.mean(samples**2))
        if level < 0.01:
            return AudioSegmentType.SILENCE, 0.95

        # Each sign flip contributes 2 to the summed |diff|, hence the halving
        sign_flips = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
        crossing_rate = sign_flips / len(samples)

        # Magnitude spectrum and the frequency of each bin
        magnitudes = np.abs(np.fft.rfft(samples))
        bin_freqs = np.fft.rfftfreq(len(samples), 1.0 / segment.metadata.sample_rate)

        # Flatness = geometric mean / arithmetic mean; larger means more noise-like
        geo_mean = np.exp(np.mean(np.log(magnitudes + 1e-10)))
        flatness = geo_mean / (np.mean(magnitudes) + 1e-10)

        # Fraction of spectral energy inside the 300-3400 Hz speech band
        in_speech_band = (bin_freqs >= 300) & (bin_freqs <= 3400)
        band_energy = np.sum(magnitudes[in_speech_band] ** 2)
        speech_share = band_energy / (np.sum(magnitudes**2) + 1e-10)

        # Magnitude-weighted mean frequency of the spectrum
        centroid = np.sum(bin_freqs * magnitudes) / (np.sum(magnitudes) + 1e-10)

        # Rules are tried in order: noise, speech, music, then a weak speech default
        if flatness > 0.5 and crossing_rate > 0.1:
            return AudioSegmentType.NOISE, min(0.7, float(flatness))

        if speech_share > 0.4 and 0.02 < crossing_rate < 0.15:
            return AudioSegmentType.SPEECH, min(0.8, float(speech_share))

        if flatness < 0.3 and centroid < 2000:
            return AudioSegmentType.MUSIC, min(0.7, 1 - float(flatness))

        return AudioSegmentType.SPEECH, 0.4

    def normalize(
        self,
        target_db: float = -3.0,
        method: str = "peak",
    ) -> "Audio":
        """Scale the audio so its peak or RMS level lands on a target dB value.

        Audio whose measured level is below 1e-10 is returned as an unscaled
        copy. Scaled samples are clipped to [-1.0, 1.0].

        Args:
            target_db: Target level in dB (default: -3.0 dB, allows headroom)
            method: Normalization method, either "peak" or "rms" (default: "peak")

        Returns:
            New Audio object with normalized levels

        Raises:
            ValueError: If method is neither "peak" nor "rms"

        Example:
            >>> normalized = audio.normalize(target_db=-3.0, method="peak")
            >>> print(f"New peak: {normalized.get_levels().db_peak:.1f} dB")
        """
        samples = self.data.copy()

        # Measure the current level with whichever reference the caller chose
        if method == "peak":
            reference = np.max(np.abs(samples))
        elif method == "rms":
            reference = np.sqrt(np.mean(samples**2))
        else:
            raise ValueError(f"Unknown method: {method}. Use 'peak' or 'rms'")

        # Effectively silent input cannot be scaled meaningfully
        if reference < 1e-10:
            return Audio(samples, self.metadata)

        # dB -> linear amplitude, divided by the measured level, gives the gain
        gain = 10 ** (target_db / 20) / reference
        scaled = np.clip(samples * gain, -1.0, 1.0)

        return Audio(scaled.astype(np.float32), self.metadata)

is_silent `property`

is_silent: bool

Check if the audio track is silent (all samples are effectively zero)

Returns:

Name	Type	Description
`bool`	`bool`	True if the audio is silent, False otherwise

__init__

__init__(data: ndarray, metadata: AudioMetadata)

Initialize Audio object

Parameters:

Name	Type	Description	Default
`data`	`ndarray`	Audio data as numpy array, normalized between -1 and 1	required
`metadata`	`AudioMetadata`	AudioMetadata object containing audio properties	required

Source code in src/videopython/base/audio/audio.py

def __init__(self, data: np.ndarray, metadata: AudioMetadata):
    """
    Store the sample array and its metadata on the new Audio object

    Args:
        data: Audio data as numpy array, normalized between -1 and 1
        metadata: AudioMetadata object containing audio properties
    """
    # Both values are kept by reference; nothing is copied or validated here
    self.metadata = metadata
    self.data = data

create_silent `classmethod`

create_silent(
    duration_seconds: float,
    stereo: bool = True,
    sample_rate: int = 44100,
    sample_width: int = 2,
) -> Audio

Create a silent audio track.

Parameters:

Name	Type	Description	Default
`duration_seconds`	`float`	Length of the silent track in seconds	required
`stereo`	`bool`	If True, create stereo track; if False, create mono track (default: True)	`True`
`sample_rate`	`int`	Sample rate in Hz (default: 44100)	`44100`
`sample_width`	`int`	Sample width in bytes (default: 2, which is 16-bit)	`2`

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio instance with silent track

Raises:

Type	Description
`ValueError`	If duration is not positive or other parameters are invalid

Source code in src/videopython/base/audio/audio.py

@classmethod
def create_silent(
    cls, duration_seconds: float, stereo: bool = True, sample_rate: int = 44100, sample_width: int = 2
) -> Audio:
    """
    Build an all-zero (silent) audio track of the requested length.

    Args:
        duration_seconds: Length of the silent track in seconds
        stereo: If True, create stereo track; if False, create mono track (default: True)
        sample_rate: Sample rate in Hz (default: 44100)
        sample_width: Sample width in bytes (default: 2, which is 16-bit)

    Returns:
        Audio: New Audio instance with silent track

    Raises:
        ValueError: If duration_seconds or sample_rate is not positive, or
            sample_width is not 1, 2, or 4
    """
    if duration_seconds <= 0:
        raise ValueError("Duration must be positive")
    if sample_rate <= 0:
        raise ValueError("Sample rate must be positive")
    if sample_width not in {1, 2, 4}:
        raise ValueError("Sample width must be 1, 2, or 4 bytes")

    # Fractional frames are truncated
    total_frames = int(duration_seconds * sample_rate)

    # Stereo is stored as (frames, 2); mono as a flat (frames,) array
    if stereo:
        channel_count = 2
        samples = np.zeros((total_frames, channel_count), dtype=np.float32)
    else:
        channel_count = 1
        samples = np.zeros((total_frames,), dtype=np.float32)

    return cls(
        samples,
        AudioMetadata(
            sample_rate=sample_rate,
            channels=channel_count,
            sample_width=sample_width,
            duration_seconds=duration_seconds,
            frame_count=total_frames,
        ),
    )

from_path `classmethod`

from_path(file_path: str | Path) -> Audio

Load audio from a file using ffmpeg

Parameters:

Name	Type	Description	Default
`file_path`	`str \| Path`	Path to the audio file	required

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio instance

Raises:

Type	Description
`FileNotFoundError`	If the file doesn't exist
`AudioLoadError`	If there's an error loading the audio

Source code in src/videopython/base/audio/audio.py

@classmethod
def from_path(cls, file_path: str | Path) -> Audio:
    """
    Load audio from a file using ffmpeg

    The file is probed with _get_ffmpeg_info, transcoded to WAV on ffmpeg's
    stdout, and the PCM samples are converted to float32 in [-1, 1].
    Multi-channel audio is returned with shape (frames, channels); mono
    audio stays a flat array.

    Args:
        file_path: Path to the audio file

    Returns:
        Audio: New Audio instance

    Raises:
        FileNotFoundError: If the file doesn't exist
        AudioLoadError: If there's an error loading the audio
    """
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Probe the source so the transcode keeps its sample rate and channel count
    info = cls._get_ffmpeg_info(file_path)

    # Convert to WAV using ffmpeg
    cmd = [
        "ffmpeg",
        "-i",
        str(file_path),
        "-f",
        "wav",
        "-ar",
        str(info["sample_rate"]),  # sample rate
        "-ac",
        str(info["channels"]),  # channels
        "-bits_per_raw_sample",
        str(info["bit_depth"]),
        "-",  # Output to stdout
    ]

    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        wav_data, stderr = process.communicate()

        if process.returncode != 0:
            raise AudioLoadError(f"FFmpeg error: {stderr.decode()}")

        # Parse the in-memory WAV container
        with io.BytesIO(wav_data) as wav_io, wave.open(wav_io, "rb") as wav_file:
            sample_width = wav_file.getsampwidth()
            channels = wav_file.getnchannels()
            sample_rate = wav_file.getframerate()
            raw_data = wav_file.readframes(wav_file.getnframes())

        # 8-bit WAV PCM is unsigned (zero level at 128); wider widths are signed
        dtype_map = {1: np.uint8, 2: np.int16, 4: np.int32}
        dtype = dtype_map.get(sample_width)
        if dtype is None:
            raise AudioLoadError(f"Unsupported sample width: {sample_width}")

        data = np.frombuffer(raw_data, dtype=dtype).astype(np.float32)

        # Normalize to float between -1 and 1
        if sample_width == 1:
            data = (data - 128.0) / 127.0
        else:
            data = data / float(np.iinfo(dtype).max)

        # De-interleave so each row is one frame; keeps frame_count correct
        # for any channel count, not just stereo
        if channels > 1:
            data = data.reshape(-1, channels)

        # The most negative integer maps slightly below -1.0, so clamp
        data = np.clip(data, -1.0, 1.0)

        metadata = AudioMetadata(
            sample_rate=sample_rate,
            channels=channels,
            sample_width=sample_width,
            duration_seconds=info["duration"],
            frame_count=len(data),
        )

        return cls(data, metadata)

    except subprocess.CalledProcessError as e:
        raise AudioLoadError(f"Error running ffmpeg: {e}") from e

from_file `classmethod`

from_file(file_path: str | Path) -> Audio

Deprecated: Use from_path() instead.

Source code in src/videopython/base/audio/audio.py

@classmethod
def from_file(cls, file_path: str | Path) -> Audio:
    """Deprecated alias: emits a DeprecationWarning, then delegates to from_path()."""
    import warnings

    notice = "Audio.from_file() is deprecated, use Audio.from_path() instead"
    # stacklevel=2 attributes the warning to the caller rather than this shim
    warnings.warn(notice, category=DeprecationWarning, stacklevel=2)

    loaded = cls.from_path(file_path)
    return loaded

silence `classmethod`

silence(
    duration: float,
    sample_rate: int = 44100,
    channels: int = 2,
) -> Audio

Create a silent audio track.

Parameters:

Name	Type	Description	Default
`duration`	`float`	Duration in seconds.	required
`sample_rate`	`int`	Sample rate in Hz. Default: 44100.	`44100`
`channels`	`int`	Number of channels (1 for mono, 2 for stereo). Default: 2.	`2`

Returns:

Name	Type	Description
`Audio`	`Audio`	Silent audio track with the specified parameters.

Example

silence = Audio.silence(duration=5.0)  # 5 seconds of silence
silence = Audio.silence(duration=2.0, sample_rate=22050, channels=1)

Source code in src/videopython/base/audio/audio.py

@classmethod
def silence(
    cls,
    duration: float,
    sample_rate: int = 44100,
    channels: int = 2,
) -> Audio:
    """Create a silent audio track.

    Mono tracks are stored as a flat (frames,) array and multi-channel
    tracks as (frames, channels), the same layout create_silent() produces,
    so the result can be combined with other mono Audio objects.

    Args:
        duration: Duration in seconds.
        sample_rate: Sample rate in Hz. Default: 44100.
        channels: Number of channels (1 for mono, 2 for stereo). Default: 2.

    Returns:
        Audio: Silent audio track with the specified parameters.

    Example:
        >>> silence = Audio.silence(duration=5.0)  # 5 seconds of silence
        >>> silence = Audio.silence(duration=2.0, sample_rate=22050, channels=1)
    """
    # Fractional frames are truncated
    frame_count = int(duration * sample_rate)

    # A (frames, 1) array would not concatenate with the 1-D mono arrays used elsewhere
    shape = (frame_count,) if channels == 1 else (frame_count, channels)
    data = np.zeros(shape, dtype=np.float32)

    metadata = AudioMetadata(
        sample_rate=sample_rate,
        channels=channels,
        sample_width=2,  # fixed at 16-bit
        duration_seconds=duration,
        frame_count=frame_count,
    )

    return cls(data, metadata)

to_mono

to_mono() -> Audio

Convert stereo audio to mono by averaging channels

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio instance with mono audio

Source code in src/videopython/base/audio/audio.py

def to_mono(self) -> Audio:
    """
    Mix the audio down to a single channel by averaging across channels

    Audio that is already mono is returned as-is (the same object, not a copy).

    Returns:
        Audio: New Audio instance with mono audio
    """
    source_meta = self.metadata
    if source_meta.channels == 1:
        return self

    # Average along the channel axis, one value per frame
    averaged = self.data.mean(axis=1)

    return Audio(
        averaged,
        AudioMetadata(
            sample_rate=source_meta.sample_rate,
            channels=1,
            sample_width=source_meta.sample_width,
            duration_seconds=source_meta.duration_seconds,
            frame_count=len(averaged),
        ),
    )

get_channel

get_channel(channel: int) -> Audio

Extract a single channel from the audio

Parameters:

Name	Type	Description	Default
`channel`	`int`	Channel number (0 for left, 1 for right)	required

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio instance with single channel

Raises:

Type	Description
`ValueError`	If channel number is invalid

Source code in src/videopython/base/audio/audio.py

def get_channel(self, channel: int) -> Audio:
    """
    Pull one channel out of the audio as a mono track

    Mono audio is returned as-is (the same object) without checking the
    channel argument.

    Args:
        channel: Channel number (0 for left, 1 for right)

    Returns:
        Audio: New Audio instance with single channel

    Raises:
        ValueError: If channel number is invalid
    """
    source_meta = self.metadata
    if source_meta.channels == 1:
        return self

    if channel not in (0, 1):
        raise ValueError("Channel must be 0 (left) or 1 (right)")

    # Select one column of the (frames, channels) array
    picked = self.data[:, channel]

    return Audio(
        picked,
        AudioMetadata(
            sample_rate=source_meta.sample_rate,
            channels=1,
            sample_width=source_meta.sample_width,
            duration_seconds=source_meta.duration_seconds,
            frame_count=len(picked),
        ),
    )

resample

resample(target_sample_rate: int) -> Audio

Resample the audio to a new sample rate

Parameters:

Name	Type	Description	Default
`target_sample_rate`	`int`	New sample rate in Hz	required

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio instance with resampled audio

Source code in src/videopython/base/audio/audio.py

def resample(self, target_sample_rate: int) -> Audio:
    """
    Resample the audio to a new sample rate

    If the audio is already at target_sample_rate, the same object is
    returned unchanged. Otherwise each channel is resampled independently
    via _resample_channel and the duration is recomputed from the new
    frame count.

    Args:
        target_sample_rate: New sample rate in Hz

    Returns:
        Audio: New Audio instance with resampled audio

    Raises:
        ValueError: If target_sample_rate is not positive
    """
    if target_sample_rate <= 0:
        # A zero rate would otherwise surface later as a ZeroDivisionError
        raise ValueError("Target sample rate must be positive")

    if target_sample_rate == self.metadata.sample_rate:
        return self

    # Calculate resampling ratio
    ratio = target_sample_rate / self.metadata.sample_rate

    target_length = round(self.data.shape[0] * ratio)

    # View mono data as a single column so both layouts share one loop
    audio_array = self.data
    if self.metadata.channels == 1:
        audio_array = audio_array.reshape(-1, 1)

    resampled_data = np.zeros((target_length, self.metadata.channels), dtype=np.float32)

    for channel in range(self.metadata.channels):
        resampled_data[:, channel] = self._resample_channel(audio_array[:, channel], target_length)

    new_metadata = AudioMetadata(
        sample_rate=target_sample_rate,
        channels=self.metadata.channels,
        sample_width=self.metadata.sample_width,
        duration_seconds=target_length / target_sample_rate,
        frame_count=target_length,
    )

    # Restore the flat layout used for mono audio
    if self.metadata.channels == 1:
        resampled_data = resampled_data.flatten()

    return Audio(resampled_data, new_metadata)

concat

concat(other: Audio, crossfade: float = 0.0) -> Audio

Concatenate another audio segment to this one. If mixing mono and stereo, converts mono to stereo.

Parameters:

Name	Type	Description	Default
`other`	`Audio`	Another Audio object to concatenate	required
`crossfade`	`float`	Duration of crossfade in seconds (default: 0.0 for no crossfade)	`0.0`

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio object with concatenated data

Raises:

Type	Description
`ValueError`	If audio metadata doesn't match or crossfade duration is invalid

Source code in src/videopython/base/audio/audio.py

def concat(self, other: Audio, crossfade: float = 0.0) -> Audio:
    """
    Append another audio segment to this one, optionally with a linear crossfade.
    If exactly one side is mono and the other stereo, the mono side is
    converted to stereo first.

    Args:
        other: Another Audio object to concatenate
        crossfade: Duration of crossfade in seconds (default: 0.0 for no crossfade)

    Returns:
        Audio: New Audio object with concatenated data

    Raises:
        ValueError: If audio metadata doesn't match or crossfade duration is invalid
    """
    if abs(self.metadata.sample_rate - other.metadata.sample_rate) > 0:
        raise ValueError("Sample rates must match")
    if self.metadata.sample_width != other.metadata.sample_width:
        raise ValueError("Sample widths must match")

    # Output is stereo as soon as either input is stereo
    wants_stereo = self.metadata.channels == 2 or other.metadata.channels == 2

    head = self._to_stereo() if wants_stereo and self.metadata.channels == 1 else self
    tail = other._to_stereo() if wants_stereo and other.metadata.channels == 1 else other

    if head.metadata.channels != tail.metadata.channels:
        raise ValueError("Channel counts must match")

    head_meta = head.metadata
    tail_meta = tail.metadata
    is_mono = head_meta.channels == 1

    # Plain back-to-back join when no crossfade is requested
    if crossfade <= 0:
        join = np.concatenate if is_mono else np.vstack
        joined = join([head.data, tail.data])

        return Audio(
            joined,
            AudioMetadata(
                sample_rate=head_meta.sample_rate,
                channels=head_meta.channels,
                sample_width=head_meta.sample_width,
                duration_seconds=head_meta.duration_seconds + tail_meta.duration_seconds,
                frame_count=len(joined),
            ),
        )

    if crossfade > min(head_meta.duration_seconds, tail_meta.duration_seconds):
        raise ValueError("Crossfade duration cannot exceed duration of either audio segment")

    # Number of frames shared by the end of the first and the start of the second
    overlap = int(crossfade * head_meta.sample_rate)

    out_len = len(head.data) + len(tail.data) - overlap
    out_shape = (out_len,) if is_mono else (out_len, 2)
    mixed = np.zeros(out_shape, dtype=np.float32)

    # Everything outside the overlap is copied through untouched
    blend_at = len(head.data) - overlap
    blend_end = blend_at + overlap
    mixed[:blend_at] = head.data[:blend_at]
    mixed[blend_end:] = tail.data[overlap:]

    # Linear ramps: first segment fades 1 -> 0 while the second fades 0 -> 1
    ramp_down = np.linspace(1, 0, overlap)
    ramp_up = np.linspace(0, 1, overlap)

    if is_mono:
        mixed[blend_at:blend_end] = head.data[blend_at:] * ramp_down + tail.data[:overlap] * ramp_up
    else:
        for ch in range(head_meta.channels):
            mixed[blend_at:blend_end, ch] = (
                head.data[blend_at:, ch] * ramp_down + tail.data[:overlap, ch] * ramp_up
            )

    # Duration follows from the actual output length
    return Audio(
        mixed,
        AudioMetadata(
            sample_rate=head_meta.sample_rate,
            channels=head_meta.channels,
            sample_width=head_meta.sample_width,
            duration_seconds=out_len / head_meta.sample_rate,
            frame_count=out_len,
        ),
    )

slice

slice(
    start_seconds: float = 0.0,
    end_seconds: float | None = None,
) -> Audio

Extract a portion of the audio between start_seconds and end_seconds.

Parameters:

Name	Type	Description	Default
`start_seconds`	`float`	Start time in seconds (default: 0.0)	`0.0`
`end_seconds`	`float \| None`	End time in seconds (default: None, meaning end of audio)	`None`

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio instance with the extracted portion

Raises:

Type	Description
`ValueError`	If start_seconds or end_seconds are invalid

Source code in src/videopython/base/audio/audio.py

def slice(self, start_seconds: float = 0.0, end_seconds: float | None = None) -> Audio:
    """
    Extract a portion of the audio between start_seconds and end_seconds.

    Times are converted to sample indices by truncation. An end time that
    overshoots the duration by no more than 1e-6 seconds is clamped to the
    duration instead of being rejected.

    Args:
        start_seconds: Start time in seconds (default: 0.0)
        end_seconds: End time in seconds (default: None, meaning end of audio)

    Returns:
        Audio: New Audio instance with the extracted portion

    Raises:
        ValueError: If start_seconds is negative or lies beyond the audio
            duration, or if end_seconds is before start_seconds or beyond
            the audio duration
    """
    # Validate inputs
    if start_seconds < 0:
        raise ValueError("start_seconds must be non-negative")

    duration_seconds = self.metadata.duration_seconds
    duration_tolerance = 1e-6

    if end_seconds is not None:
        if end_seconds < start_seconds:
            raise ValueError("end_seconds must be greater than start_seconds")
        if end_seconds > duration_seconds + duration_tolerance:
            raise ValueError("end_seconds cannot exceed audio duration")
        end_seconds = min(end_seconds, duration_seconds)
    else:
        # Without an explicit end the ordering check above never runs, so a
        # start past the end would yield an empty slice with negative duration.
        if start_seconds > duration_seconds + duration_tolerance:
            raise ValueError("start_seconds cannot exceed audio duration")
        end_seconds = duration_seconds

    # A start inside the tolerance band is clamped so start never exceeds end
    start_seconds = min(start_seconds, end_seconds)

    # Convert seconds to sample indices
    start_idx = int(start_seconds * self.metadata.sample_rate)
    end_idx = int(end_seconds * self.metadata.sample_rate)

    # Extract the portion of audio data
    sliced_data = self.data[start_idx:end_idx]

    # Duration is derived from the index span rather than the requested times
    new_duration = (end_idx - start_idx) / self.metadata.sample_rate

    new_metadata = AudioMetadata(
        sample_rate=self.metadata.sample_rate,
        channels=self.metadata.channels,
        sample_width=self.metadata.sample_width,
        duration_seconds=new_duration,
        frame_count=len(sliced_data),
    )

    return Audio(sliced_data, new_metadata)

scale_volume

scale_volume(factor: float) -> Audio

Scale audio volume by a factor.

Parameters:

Name	Type	Description	Default
`factor`	`float`	Volume multiplier. 1.0 = no change, 0.5 = half volume, 2.0 = double volume (may clip).	required

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio object with scaled volume.

Raises:

Type	Description
`ValueError`	If factor is negative.

Source code in src/videopython/base/audio/audio.py

def scale_volume(self, factor: float) -> Audio:
    """
    Multiply every sample by a gain factor, clipping the result to [-1.0, 1.0].

    Args:
        factor: Volume multiplier. 1.0 = no change, 0.5 = half volume,
                2.0 = double volume (may clip).

    Returns:
        Audio: New Audio object with scaled volume.

    Raises:
        ValueError: If factor is negative.
    """
    if factor < 0:
        raise ValueError("Volume factor must be non-negative")

    # Clamp so amplified samples stay inside the normalized range
    limited = np.clip(self.data * factor, -1.0, 1.0)

    return Audio(limited.astype(np.float32), self.metadata)

fit_to_duration

fit_to_duration(target_duration: float) -> 'Audio'

Adjust audio duration to match a target, slicing or padding with silence.

If audio is longer than target, it will be sliced. If audio is shorter than target, silence will be appended.

Parameters:

Name	Type	Description	Default
`target_duration`	`float`	Target duration in seconds.	required

Returns:

Name	Type	Description
`Audio`	`'Audio'`	New Audio object with the target duration.

Raises:

Type	Description
`ValueError`	If target_duration is not positive.

Source code in src/videopython/base/audio/audio.py

def fit_to_duration(self, target_duration: float) -> "Audio":
    """
    Trim or pad the audio so that it lasts target_duration seconds.

    Longer audio is sliced from the start; shorter audio gets silence
    appended; audio already at the target length is returned as-is.

    Args:
        target_duration: Target duration in seconds.

    Returns:
        Audio: New Audio object with the target duration.

    Raises:
        ValueError: If target_duration is not positive.
    """
    if target_duration <= 0:
        raise ValueError("Target duration must be positive")

    have = self.metadata.duration_seconds

    # Too long: keep only the leading target_duration seconds
    if have > target_duration:
        return self.slice(0, target_duration)

    # Already the right length: nothing to do
    if not have < target_duration:
        return self

    # Too short: append silence that matches this track's format
    padding = Audio.create_silent(
        target_duration - have,
        stereo=self.metadata.channels == 2,
        sample_rate=self.metadata.sample_rate,
        sample_width=self.metadata.sample_width,
    )
    return self.concat(padding)

time_stretch

time_stretch(speed: float) -> Audio

Time-stretch audio by a speed factor (pitch-preserving).

Uses ffmpeg's atempo filter for high-quality time stretching. For speeds outside the 0.5-2.0 range, multiple atempo filters are chained.

Parameters:

Name	Type	Description	Default
`speed`	`float`	Speed multiplier. 2.0 = twice as fast (half duration), 0.5 = half speed (double duration).	required

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio object with time-stretched audio.

Raises:

Type	Description
`ValueError`	If speed is not positive.
`AudioLoadError`	If ffmpeg fails.

Source code in src/videopython/base/audio/audio.py

def time_stretch(self, speed: float) -> Audio:
    """
    Change playback speed without changing pitch, using ffmpeg's atempo filter.

    The audio is written to a temporary WAV file, run through ffmpeg, and the
    result is loaded back with ``Audio.from_path``. Both temporary files are
    removed afterwards, whether or not ffmpeg succeeded.

    Args:
        speed: Speed multiplier. 2.0 = twice as fast (half duration),
               0.5 = half speed (double duration).

    Returns:
        Audio: New Audio object with time-stretched audio. When ``speed`` is
        within 0.001 of 1.0, a copy of the samples with the same metadata
        object is returned and ffmpeg is not invoked.

    Raises:
        ValueError: If speed is not positive.
        AudioLoadError: If ffmpeg fails.
    """
    if speed <= 0:
        raise ValueError("Speed must be positive")

    # Effectively unity speed: skip the ffmpeg round-trip.
    if abs(speed - 1.0) < 0.001:
        return Audio(self.data.copy(), self.metadata)

    # A single atempo instance only accepts factors in [0.5, 2.0], so peel off
    # factors of 2.0 (or 0.5) until the remainder fits, then add the remainder.
    chain = []
    factor = speed
    while factor > 2.0:
        chain.append("atempo=2.0")
        factor /= 2.0
    while factor < 0.5:
        chain.append("atempo=0.5")
        factor /= 0.5
    if abs(factor - 1.0) > 0.001:
        chain.append(f"atempo={factor}")
    filter_str = ",".join(chain) or "anull"

    import os
    import tempfile

    # Reserve two closed temp files: ffmpeg input and ffmpeg output.
    reserved = []
    for _ in range(2):
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            reserved.append(handle.name)
    input_path, output_path = reserved

    try:
        self.save(input_path, format="wav")

        # -ar / -ac pin the output to this audio's sample rate and channel count.
        cmd = ["ffmpeg", "-y", "-i", input_path, "-af", filter_str]
        cmd += ["-ar", str(self.metadata.sample_rate)]
        cmd += ["-ac", str(self.metadata.channels)]
        cmd.append(output_path)

        completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if completed.returncode != 0:
            raise AudioLoadError(f"FFmpeg time stretch failed: {completed.stderr.decode()}")

        return Audio.from_path(output_path)
    finally:
        # Best-effort cleanup; a file that is already gone is not an error.
        for leftover in (input_path, output_path):
            try:
                os.unlink(leftover)
            except OSError:
                pass

overlay

overlay(other: Audio, position: float = 0.0) -> Audio

Overlay another audio segment on top of this one, mixing both signals. If mixing mono and stereo, converts mono to stereo.

Parameters:

Name	Type	Description	Default
`other`	`Audio`	Another Audio object to overlay	required
`position`	`float`	Start position in seconds for the overlay (default: 0.0)	`0.0`

Returns:

Name	Type	Description
`Audio`	`Audio`	New Audio object with mixed audio

Raises:

Type	Description
`ValueError`	If audio metadata doesn't match or position is invalid

Source code in src/videopython/base/audio/audio.py

def overlay(self, other: Audio, position: float = 0.0) -> Audio:
    """
    Mix another audio track on top of this one, starting at ``position``.

    A mono track is converted to stereo when the other track is stereo. The
    result is long enough to hold both tracks; if the summed signal exceeds
    an absolute amplitude of 1.0, the whole mix is scaled down by its peak.

    Args:
        other: Another Audio object to overlay
        position: Start position in seconds for the overlay (default: 0.0)

    Returns:
        Audio: New Audio object with mixed audio

    Raises:
        ValueError: If audio metadata doesn't match or position is invalid
    """
    if self.metadata.sample_rate != other.metadata.sample_rate:
        raise ValueError("Sample rates must match")
    if self.metadata.sample_width != other.metadata.sample_width:
        raise ValueError("Sample widths must match")
    if position < 0:
        raise ValueError("Position cannot be negative")

    # The mix is stereo as soon as either input is stereo.
    wants_stereo = 2 in (self.metadata.channels, other.metadata.channels)

    def _match_layout(track):
        # Upmix a mono track only when the mix has to be stereo.
        if wants_stereo and track.metadata.channels == 1:
            return track._to_stereo()
        return track

    base = _match_layout(self)
    layer = _match_layout(other)

    if base.metadata.channels != layer.metadata.channels:
        raise ValueError("Channel counts must match")

    # Round the start up to a whole sample so the overlay is never placed early.
    start = int(np.ceil(position * base.metadata.sample_rate))
    stop = start + len(layer.data)
    base_len = len(base.data)
    total_length = max(base_len, stop)

    if base.metadata.channels == 1:
        mix = np.zeros(total_length, dtype=np.float32)
    else:
        mix = np.zeros((total_length, 2), dtype=np.float32)  # type: ignore

    # Lay down the base track, then sum the overlay into its window.
    mix[:base_len] = base.data
    mix[start:stop] += layer.data

    # Scale the whole mix down only when the sum would clip.
    peak = np.max(np.abs(mix))
    if peak > 1.0:
        mix = mix / peak

    mixed_duration = max(
        base.metadata.duration_seconds,
        position + layer.metadata.duration_seconds,
    )
    return Audio(
        mix,
        AudioMetadata(
            sample_rate=base.metadata.sample_rate,
            channels=base.metadata.channels,
            sample_width=base.metadata.sample_width,
            duration_seconds=mixed_duration,
            frame_count=total_length,
        ),
    )

save

save(
    file_path: str | Path, format: str | None = None
) -> None

Save audio to a file using ffmpeg

Parameters:

Name	Type	Description	Default
`file_path`	`str \| Path`	Path to save the audio file	required
`format`	`str \| None`	Output format (e.g., 'mp3', 'wav'). If None, inferred from extension.	`None`

Source code in src/videopython/base/audio/audio.py

def save(self, file_path: str | Path, format: str | None = None) -> None:
    """
    Save audio to a file using ffmpeg

    Args:
        file_path: Path to save the audio file
        format: Output format (e.g., 'mp3', 'wav'). If None, inferred from extension.
    """
    file_path = Path(file_path)

    # Convert data back to int16
    int_data = (self.data * np.iinfo(np.int16).max).astype(np.int16)

    # Create WAV in memory
    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as wav_file:
        wav_file.setnchannels(self.metadata.channels)
        wav_file.setsampwidth(self.metadata.sample_width)
        wav_file.setframerate(self.metadata.sample_rate)
        wav_file.writeframes(int_data.tobytes())

    wav_io.seek(0)

    # Check and infer format
    if format is None:
        format = file_path.suffix[1:]  # Remove the dot

    # Validate format
    SUPPORTED_FORMATS = {"mp3", "wav", "ogg", "flac"}
    if format not in SUPPORTED_FORMATS:
        raise ValueError(f"Unsupported format: {format}. Supported formats are: {', '.join(SUPPORTED_FORMATS)}")

    # Build ffmpeg command
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file
        "-f",
        "wav",  # Input format
        "-i",
        "-",  # Read from stdin
    ]

    if format:
        cmd.extend(["-f", format])

    cmd.append(str(file_path))

    try:
        process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
        _, stderr = process.communicate(wav_io.getvalue())

        if process.returncode != 0:
            raise AudioLoadError(f"Error saving audio: {stderr.decode()}")

    except subprocess.CalledProcessError as e:
        raise AudioLoadError(f"Error running ffmpeg: {e}")

`__len__`

__len__() -> int

Returns the number of samples

Source code in src/videopython/base/audio/audio.py

def __len__(self) -> int:
    """Return the sample count recorded in the metadata (``frame_count``)."""
    meta = self.metadata
    return meta.frame_count

`__repr__`

__repr__() -> str

String representation of the Audio object

Source code in src/videopython/base/audio/audio.py

def __repr__(self) -> str:
    """Summarise the audio as channel count, sample rate and duration."""
    meta = self.metadata
    fields = (
        f"channels={meta.channels}",
        f"sample_rate={meta.sample_rate}Hz",
        f"duration={meta.duration_seconds:.2f}s",
    )
    return f"Audio({', '.join(fields)})"

get_levels

get_levels(
    start_seconds: float = 0.0,
    end_seconds: float | None = None,
) -> "AudioLevels"

Calculate audio levels for a segment.

Parameters:

Name	Type	Description	Default
`start_seconds`	`float`	Start time in seconds (default: 0.0)	`0.0`
`end_seconds`	`float \| None`	End time in seconds (default: None, meaning end of audio)	`None`

Returns:

Type	Description
`'AudioLevels'`	AudioLevels with RMS, peak, and dB measurements

Example

audio = Audio.from_path("audio.mp3")
levels = audio.get_levels()
print(f"Peak: {levels.db_peak:.1f} dB")

Source code in src/videopython/base/audio/audio.py

def get_levels(
    self,
    start_seconds: float = 0.0,
    end_seconds: float | None = None,
) -> "AudioLevels":
    """Calculate audio levels for a segment.

    Stereo data is flattened so that both channels contribute to a single
    RMS and peak value. A segment that contains no samples is reported as
    silence (rms and peak of 0.0) instead of failing inside numpy.

    Args:
        start_seconds: Start time in seconds (default: 0.0)
        end_seconds: End time in seconds (default: None, meaning end of audio)

    Returns:
        AudioLevels with RMS, peak, and dB measurements. dB values are
        computed from amplitudes clamped to at least 1e-10, so they never
        go below -200 dB.

    Example:
        >>> audio = Audio.from_path("audio.mp3")
        >>> levels = audio.get_levels()
        >>> print(f"Peak: {levels.db_peak:.1f} dB")
    """
    from videopython.base.audio.analysis import AudioLevels

    segment = self.slice(start_seconds, end_seconds)
    data = segment.data.flatten() if segment.metadata.channels == 2 else segment.data

    if np.size(data) == 0:
        # np.max raises and np.mean yields NaN on zero-size input.
        rms = 0.0
        peak = 0.0
    else:
        rms = float(np.sqrt(np.mean(data**2)))
        peak = float(np.max(np.abs(data)))

    # Convert to dB (avoid log of zero)
    db_rms = 20 * np.log10(max(rms, 1e-10))
    db_peak = 20 * np.log10(max(peak, 1e-10))

    return AudioLevels(rms=rms, peak=peak, db_rms=float(db_rms), db_peak=float(db_peak))

get_levels_over_time

get_levels_over_time(
    window_seconds: float = 0.1,
    hop_seconds: float | None = None,
) -> list[tuple[float, "AudioLevels"]]

Calculate audio levels over time using a sliding window.

Parameters:

Name	Type	Description	Default
`window_seconds`	`float`	Window size in seconds (default: 0.1)	`0.1`
`hop_seconds`	`float \| None`	Hop size in seconds (default: window_seconds / 2)	`None`

Returns:

Type	Description
`list[tuple[float, 'AudioLevels']]`	List of (timestamp, AudioLevels) tuples where timestamp is the center of each window

Example

levels_over_time = audio.get_levels_over_time(window_seconds=0.1)
for timestamp, levels in levels_over_time:
    print(f"{timestamp:.2f}s: {levels.db_rms:.1f} dB")

Source code in src/videopython/base/audio/audio.py

def get_levels_over_time(
    self,
    window_seconds: float = 0.1,
    hop_seconds: float | None = None,
) -> list[tuple[float, "AudioLevels"]]:
    """Calculate audio levels over time using a sliding window.

    Args:
        window_seconds: Window size in seconds (default: 0.1)
        hop_seconds: Hop size in seconds (default: window_seconds / 2)

    Returns:
        List of (timestamp, AudioLevels) tuples where timestamp is the
        center of each window

    Example:
        >>> levels_over_time = audio.get_levels_over_time(window_seconds=0.1)
        >>> for timestamp, levels in levels_over_time:
        ...     print(f"{timestamp:.2f}s: {levels.db_rms:.1f} dB")
    """
    if hop_seconds is None:
        hop_seconds = window_seconds / 2

    results = []
    current_time = 0.0

    while current_time + window_seconds <= self.metadata.duration_seconds:
        levels = self.get_levels(current_time, current_time + window_seconds)
        results.append((current_time + window_seconds / 2, levels))
        current_time += hop_seconds

    return results

detect_silence

detect_silence(
    threshold_db: float = -40.0,
    min_duration: float = 0.5,
    window_seconds: float = 0.1,
) -> list["SilentSegment"]

Detect silent segments in the audio.

Parameters:

Name	Type	Description	Default
`threshold_db`	`float`	RMS level below which audio is considered silent (default: -40 dB)	`-40.0`
`min_duration`	`float`	Minimum duration for a segment to be classified as silent (default: 0.5s)	`0.5`
`window_seconds`	`float`	Window size for level analysis (default: 0.1s)	`0.1`

Returns:

Type	Description
`list['SilentSegment']`	List of SilentSegment objects representing detected silent regions

Example

silent_segments = audio.detect_silence(threshold_db=-40.0, min_duration=0.5)
for seg in silent_segments:
    print(f"Silence: {seg.start:.2f}s - {seg.end:.2f}s")

Source code in src/videopython/base/audio/audio.py

def detect_silence(
    self,
    threshold_db: float = -40.0,
    min_duration: float = 0.5,
    window_seconds: float = 0.1,
) -> list["SilentSegment"]:
    """Find stretches of audio whose RMS level stays below a threshold.

    Levels are measured with half-overlapping windows. A silent run starts at
    the beginning of its first quiet window and ends at the beginning of the
    first window that is no longer quiet (or at the end of the audio if the
    run is still open). Runs shorter than ``min_duration`` are discarded.

    Args:
        threshold_db: RMS level below which audio is considered silent (default: -40 dB)
        min_duration: Minimum duration for a segment to be classified as silent (default: 0.5s)
        window_seconds: Window size for level analysis (default: 0.1s)

    Returns:
        List of SilentSegment objects representing detected silent regions

    Example:
        >>> silent_segments = audio.detect_silence(threshold_db=-40.0, min_duration=0.5)
        >>> for seg in silent_segments:
        ...     print(f"Silence: {seg.start:.2f}s - {seg.end:.2f}s")
    """
    from videopython.base.audio.analysis import SilentSegment

    windows = self.get_levels_over_time(window_seconds=window_seconds, hop_seconds=window_seconds / 2)

    found = []
    run_start = None  # start time of the open silent run, None when not in one
    run_rms: list[float] = []  # linear RMS of every window in the open run

    for center, window_levels in windows:
        # Timestamps are window centres; runs are bounded by window starts.
        window_start = center - window_seconds / 2

        if window_levels.db_rms < threshold_db:
            if run_start is None:
                run_start = window_start
                run_rms = [window_levels.rms]
            else:
                run_rms.append(window_levels.rms)
            continue

        if run_start is None:
            continue

        # A loud window closes the open run.
        run_length = window_start - run_start
        if run_length >= min_duration:
            found.append(
                SilentSegment(
                    start=run_start,
                    end=window_start,
                    duration=run_length,
                    avg_level=sum(run_rms) / len(run_rms),
                )
            )
        run_start = None
        run_rms = []

    # A run still open after the last window extends to the end of the audio.
    if run_start is not None:
        audio_end = self.metadata.duration_seconds
        run_length = audio_end - run_start
        if run_length >= min_duration:
            found.append(
                SilentSegment(
                    start=run_start,
                    end=audio_end,
                    duration=run_length,
                    avg_level=sum(run_rms) / len(run_rms) if run_rms else 0.0,
                )
            )

    return found

classify_segments

classify_segments(
    segment_length: float = 2.0, overlap: float = 0.5
) -> list["AudioSegment"]

Classify audio segments as speech, music, noise, or silence.

This uses basic signal processing heuristics (no ML):

- Zero-crossing rate (higher for speech/noise)
- Spectral flatness (higher for noise, lower for music)
- Energy distribution across frequency bands

Parameters:

Name	Type	Description	Default
`segment_length`	`float`	Length of each segment to classify in seconds (default: 2.0)	`2.0`
`overlap`	`float`	Overlap between segments as fraction (default: 0.5)	`0.5`

Returns:

Type	Description
`list['AudioSegment']`	List of AudioSegment objects with classifications

Example

segments = audio.classify_segments(segment_length=2.0)
for seg in segments:
    print(f"{seg.start:.1f}-{seg.end:.1f}s: {seg.segment_type.value}")

Source code in src/videopython/base/audio/audio.py

def classify_segments(
    self,
    segment_length: float = 2.0,
    overlap: float = 0.5,
) -> list["AudioSegment"]:
    """Classify audio segments as speech, music, noise, or silence.

    This uses basic signal processing heuristics (no ML):
    - Zero-crossing rate (higher for speech/noise)
    - Spectral flatness (higher for noise, lower for music)
    - Energy distribution across frequency bands

    Segments start at 0.0 and advance by ``segment_length * (1 - overlap)``.
    Only segments that fit entirely inside the audio are classified.

    Args:
        segment_length: Length of each segment to classify in seconds (default: 2.0)
        overlap: Overlap between segments as fraction (default: 0.5)

    Returns:
        List of AudioSegment objects with classifications

    Raises:
        ValueError: If segment_length is not positive, or if overlap is 1.0
            or more (the analysis position would never advance).

    Example:
        >>> segments = audio.classify_segments(segment_length=2.0)
        >>> for seg in segments:
        ...     print(f"{seg.start:.1f}-{seg.end:.1f}s: {seg.segment_type.value}")
    """
    if segment_length <= 0:
        raise ValueError("segment_length must be positive")

    hop_length = segment_length * (1 - overlap)
    if hop_length <= 0:
        # A non-positive hop would loop forever.
        raise ValueError("overlap must be less than 1.0")

    from videopython.base.audio.analysis import AudioSegment

    segments = []
    current_time = 0.0

    while current_time + segment_length <= self.metadata.duration_seconds:
        segment_end = current_time + segment_length
        segment_audio = self.slice(current_time, segment_end)
        segment_type, confidence = self._classify_segment(segment_audio)
        levels = self.get_levels(current_time, segment_end)

        segments.append(
            AudioSegment(
                start=current_time,
                end=segment_end,
                segment_type=segment_type,
                confidence=confidence,
                levels=levels,
            )
        )

        current_time += hop_length

    return segments

normalize

normalize(
    target_db: float = -3.0, method: str = "peak"
) -> "Audio"

Normalize audio to a target level.

Parameters:

Name	Type	Description	Default
`target_db`	`float`	Target level in dB (default: -3.0 dB, allows headroom)	`-3.0`
`method`	`str`	Normalization method, either "peak" or "rms" (default: "peak")	`'peak'`

Returns:

Type	Description
`'Audio'`	New Audio object with normalized levels

Example

normalized = audio.normalize(target_db=-3.0, method="peak")
print(f"New peak: {normalized.get_levels().db_peak:.1f} dB")

Source code in src/videopython/base/audio/audio.py

def normalize(
    self,
    target_db: float = -3.0,
    method: str = "peak",
) -> "Audio":
    """Scale the audio so its peak or RMS level matches a target in dB.

    The current level (absolute peak or RMS over all samples) is measured,
    and every sample is multiplied by ``10 ** (target_db / 20) / level``.
    The result is clipped to [-1.0, 1.0] and returned as float32. Audio whose
    measured level is below 1e-10 is returned as an unscaled copy.

    Args:
        target_db: Target level in dB (default: -3.0 dB, allows headroom)
        method: Normalization method, either "peak" or "rms" (default: "peak")

    Returns:
        New Audio object with normalized levels

    Raises:
        ValueError: If method is neither "peak" nor "rms".

    Example:
        >>> normalized = audio.normalize(target_db=-3.0, method="peak")
        >>> print(f"New peak: {normalized.get_levels().db_peak:.1f} dB")
    """
    samples = self.data.copy()

    # Measure the level that the target refers to.
    if method == "peak":
        reference = np.max(np.abs(samples))
    elif method == "rms":
        reference = np.sqrt(np.mean(samples**2))
    else:
        raise ValueError(f"Unknown method: {method}. Use 'peak' or 'rms'")

    # Effectively silent: there is nothing meaningful to scale.
    if reference < 1e-10:
        return Audio(samples, self.metadata)

    gain = 10 ** (target_db / 20) / reference

    # Clip to prevent overflow (should be rare with proper target_db)
    scaled = np.clip(samples * gain, -1.0, 1.0)

    return Audio(scaled.astype(np.float32), self.metadata)

Audio Analysis

The Audio class includes methods for analyzing audio levels, detecting silence, classifying content, and normalizing.

Level Analysis

from videopython.base import Audio

audio = Audio.from_path("audio.mp3")

# Get overall levels
levels = audio.get_levels()
print(f"Peak: {levels.db_peak:.1f} dB, RMS: {levels.db_rms:.1f} dB")

# Get levels for a specific segment
segment_levels = audio.get_levels(start_seconds=1.0, end_seconds=3.0)

# Get levels over time (sliding window analysis)
levels_over_time = audio.get_levels_over_time(window_seconds=0.1)
for timestamp, levels in levels_over_time:
    print(f"{timestamp:.2f}s: {levels.db_rms:.1f} dB")

Silence Detection

from videopython.base import Audio

audio = Audio.from_path("podcast.mp3")

# Detect silent segments
silent_segments = audio.detect_silence(
    threshold_db=-40.0,  # dB threshold
    min_duration=0.5,    # minimum silence duration in seconds
)

for seg in silent_segments:
    print(f"Silence: {seg.start:.2f}s - {seg.end:.2f}s ({seg.duration:.2f}s)")

Segment Classification

Classify audio segments as speech, music, noise, or silence using heuristic analysis (no ML required).

from videopython.base import Audio

audio = Audio.from_path("mixed_content.mp3")

# Classify 2-second segments with 50% overlap
segments = audio.classify_segments(segment_length=2.0, overlap=0.5)

for seg in segments:
    print(f"{seg.start:.1f}-{seg.end:.1f}s: {seg.segment_type.value} ({seg.confidence:.0%})")

Normalization

from videopython.base import Audio

audio = Audio.from_path("quiet_audio.mp3")

# Peak normalization (default)
normalized = audio.normalize(target_db=-3.0, method="peak")

# RMS normalization
normalized = audio.normalize(target_db=-18.0, method="rms")

# Verify
print(f"New peak: {normalized.get_levels().db_peak:.1f} dB")

Audio Manipulation

Volume Scaling

from videopython.base import Audio

audio = Audio.from_path("audio.mp3")

# Scale volume (1.0 = no change, 0.5 = half, 2.0 = double)
quieter = audio.scale_volume(0.5)
louder = audio.scale_volume(1.5)

Time Stretching

Pitch-preserving time stretching using ffmpeg's atempo filter.

from videopython.base import Audio

audio = Audio.from_path("audio.mp3")

# Speed up 2x (half duration, same pitch)
faster = audio.time_stretch(2.0)

# Slow down 0.5x (double duration, same pitch)
slower = audio.time_stretch(0.5)

# Extreme speeds are supported via chained filters
very_fast = audio.time_stretch(4.0)

Duration Fitting

Adjust audio to match a target duration by slicing or padding with silence.

from videopython.base import Audio

audio = Audio.from_path("audio.mp3")

# Fit to exactly 10 seconds
# - If longer: slices to 10s
# - If shorter: pads with silence
fitted = audio.fit_to_duration(10.0)

Data Classes

AudioMetadata

Stores metadata for audio files including sample rate, channels, duration, and frame count.

AudioMetadata `dataclass`

Stores metadata for audio files

Source code in src/videopython/base/audio/audio.py

@dataclass
class AudioMetadata:
    """Describes the format and length of the samples held by an Audio object."""

    sample_rate: int  # samples per second (Hz)
    channels: int  # 1 for mono, 2 for stereo
    sample_width: int  # in bytes
    duration_seconds: float  # length of the audio in seconds
    frame_count: int  # number of samples, as reported by len(audio)

    @property
    def bits_per_sample(self) -> int:
        """Sample width expressed in bits rather than bytes."""
        bits_per_byte = 8
        return bits_per_byte * self.sample_width

bits_per_sample `property`

bits_per_sample: int

Returns the number of bits per sample

AudioLevels

Audio level measurements (RMS, peak, dB values).

AudioLevels `dataclass`

Audio level measurements for a segment.

Attributes:

Name	Type	Description
`rms`	`float`	Root mean square (average loudness), 0.0 to 1.0
`peak`	`float`	Maximum absolute amplitude, 0.0 to 1.0
`db_rms`	`float`	RMS level in decibels (relative to full scale)
`db_peak`	`float`	Peak level in decibels (relative to full scale)

Example

audio = Audio.from_path("audio.mp3")
levels = audio.get_levels()
print(f"Peak: {levels.db_peak:.1f} dB, RMS: {levels.db_rms:.1f} dB")

Source code in src/videopython/base/audio/analysis.py

@dataclass
class AudioLevels:
    """Loudness measurements for one stretch of audio.

    Attributes:
        rms: Root mean square (average loudness), 0.0 to 1.0
        peak: Maximum absolute amplitude, 0.0 to 1.0
        db_rms: RMS level in decibels (relative to full scale)
        db_peak: Peak level in decibels (relative to full scale)

    Example:
        >>> audio = Audio.from_path("audio.mp3")
        >>> levels = audio.get_levels()
        >>> print(f"Peak: {levels.db_peak:.1f} dB, RMS: {levels.db_rms:.1f} dB")
    """

    # Linear amplitudes.
    rms: float
    peak: float
    # The same two measurements in dB; Audio.get_levels derives them as
    # 20 * log10(amplitude) with the amplitude clamped to at least 1e-10.
    db_rms: float
    db_peak: float

SilentSegment

Represents a detected silent segment with timestamps.

SilentSegment `dataclass`

Represents a detected silent segment.

Attributes:

Name	Type	Description
`start`	`float`	Start time in seconds
`end`	`float`	End time in seconds
`duration`	`float`	Duration in seconds
`avg_level`	`float`	Average RMS level during the segment

Example

silent_segments = audio.detect_silence(threshold_db=-40.0)
for seg in silent_segments:
    print(f"Silence: {seg.start:.2f}s - {seg.end:.2f}s ({seg.duration:.2f}s)")

Source code in src/videopython/base/audio/analysis.py

@dataclass
class SilentSegment:
    """One silent region reported by ``Audio.detect_silence``.

    Attributes:
        start: Start time in seconds
        end: End time in seconds
        duration: Duration in seconds
        avg_level: Average RMS level during the segment

    Example:
        >>> silent_segments = audio.detect_silence(threshold_db=-40.0)
        >>> for seg in silent_segments:
        ...     print(f"Silence: {seg.start:.2f}s - {seg.end:.2f}s ({seg.duration:.2f}s)")
    """

    start: float
    end: float
    duration: float  # stored explicitly; detect_silence sets it to end - start
    avg_level: float  # mean of the linear (not dB) RMS of the windows in the segment

AudioSegment

A classified segment of audio with type and confidence.

AudioSegment `dataclass`

A classified segment of audio.

Attributes:

Name	Type	Description
`start`	`float`	Start time in seconds
`end`	`float`	End time in seconds
`segment_type`	`AudioSegmentType`	Classification of the segment content
`confidence`	`float`	Confidence score for the classification (0.0 to 1.0)
`levels`	`AudioLevels`	Audio level measurements for the segment

Example

segments = audio.classify_segments(segment_length=2.0)
for seg in segments:
    print(f"{seg.start:.1f}-{seg.end:.1f}s: {seg.segment_type.value} ({seg.confidence:.0%})")

Source code in src/videopython/base/audio/analysis.py

@dataclass
class AudioSegment:
    """A time range of audio together with its content classification.

    Attributes:
        start: Start time in seconds
        end: End time in seconds
        segment_type: Classification of the segment content
        confidence: Confidence score for the classification (0.0 to 1.0)
        levels: Audio level measurements for the segment

    Example:
        >>> segments = audio.classify_segments(segment_length=2.0)
        >>> for seg in segments:
        ...     print(f"{seg.start:.1f}-{seg.end:.1f}s: {seg.segment_type.value} ({seg.confidence:.0%})")
    """

    start: float
    end: float
    segment_type: AudioSegmentType
    confidence: float
    levels: AudioLevels

    @property
    def duration(self) -> float:
        """Length of the segment in seconds (``end - start``)."""
        begin, finish = self.start, self.end
        return finish - begin

duration `property`

duration: float

Duration of the segment in seconds.

AudioSegmentType

Enum for audio segment classification: SILENCE, SPEECH, MUSIC, NOISE.

AudioSegmentType

Bases: Enum

Classification of audio segment content.

Source code in src/videopython/base/audio/analysis.py

class AudioSegmentType(Enum):
    """Content labels that can be assigned to a classified audio segment."""

    # Each member's value is the lowercase label string.
    SILENCE = "silence"
    SPEECH = "speech"
    MUSIC = "music"
    NOISE = "noise"

Exceptions

AudioLoadError

Exception raised when there's an error loading or saving audio files.

AudioLoadError

Bases: AudioError

Raised when there's an error loading audio.

Source code in src/videopython/base/exceptions.py

class AudioLoadError(AudioError):
    """Raised when audio cannot be loaded, saved, or processed by ffmpeg."""

Audio

Audio

Audio

is_silent property

__init__

create_silent classmethod

from_path classmethod

from_file classmethod

silence classmethod

to_mono

get_channel

resample

concat

slice

scale_volume

fit_to_duration

time_stretch

overlay

save

__len__

__repr__

get_levels

get_levels_over_time

detect_silence

classify_segments

normalize

Audio Analysis

Level Analysis

Silence Detection

Segment Classification

Normalization

Audio Manipulation

Volume Scaling

Time Stretching

Duration Fitting

Data Classes

AudioMetadata

AudioMetadata dataclass

bits_per_sample property

AudioLevels

AudioLevels dataclass

SilentSegment

SilentSegment dataclass

AudioSegment

AudioSegment dataclass

duration property

AudioSegmentType

AudioSegmentType

Exceptions

AudioLoadError

AudioLoadError

is_silent `property`

init

create_silent `classmethod`

from_path `classmethod`

from_file `classmethod`

silence `classmethod`

len

repr

AudioMetadata `dataclass`

bits_per_sample `property`

AudioLevels `dataclass`

SilentSegment `dataclass`

AudioSegment `dataclass`

duration `property`