Skip to content

AI Transforms

AI-powered video transforms that use face detection for intelligent cropping and tracking. Framing-oriented behavior (headroom / thirds / speed clamp) is implemented on FaceTrackingCrop.

Usage

from videopython.ai import FaceTrackingCrop, SplitScreenComposite
from videopython.base import Video

video = Video.from_path("input.mp4")
video2 = Video.from_path("input_2.mp4")

# Create vertical content from horizontal by tracking faces
crop = FaceTrackingCrop(target_aspect=(9, 16))
vertical_video = crop.apply(video)

# Face-tracking crop with headroom framing and limited camera speed
framing = FaceTrackingCrop(framing_rule="headroom", max_speed=0.1)
framed_video = framing.apply(video)

# Create split-screen with face tracking
composite = SplitScreenComposite(layout="2x1")
split_video = composite.apply(video, video2)

FaceTracker

FaceTracker

Utility for tracking faces across video frames with smoothing.

Provides frame-by-frame face detection with position smoothing using exponential moving average to prevent jitter in the tracked position.

Supports GPU acceleration via YOLOv8-face model for significantly faster detection, with optional frame sampling and interpolation for video.

Example

CPU tracking (default, backward compatible)

tracker = FaceTracker()
for i, frame in enumerate(frames):
    pos = tracker.detect_and_track(frame, i)

GPU tracking with frame sampling

tracker = FaceTracker(backend="gpu", sample_rate=5)
positions = tracker.track_video(frames)

Source code in src/videopython/ai/transforms.py
class FaceTracker:
    """Utility for tracking faces across video frames with smoothing.

    Provides frame-by-frame face detection with position smoothing using
    exponential moving average to prevent jitter in the tracked position.

    Supports GPU acceleration via YOLOv8-face model for significantly faster
    detection, with optional frame sampling and interpolation for video.

    Example:
        >>> # CPU tracking (default, backward compatible)
        >>> tracker = FaceTracker()
        >>> for i, frame in enumerate(frames):
        ...     pos = tracker.detect_and_track(frame, i)
        >>>
        >>> # GPU tracking with frame sampling
        >>> tracker = FaceTracker(backend="gpu", sample_rate=5)
        >>> positions = tracker.track_video(frames)
    """

    def __init__(
        self,
        selection_strategy: Literal["largest", "centered", "index"] = "largest",
        face_index: int = 0,
        smoothing: float = 0.8,
        detection_interval: int = 3,
        min_face_size: int = 30,
        backend: Literal["cpu", "gpu", "auto"] = "cpu",
        sample_rate: int = 1,
        batch_size: int = 16,
    ):
        """Initialize face tracker.

        Args:
            selection_strategy: How to select which face to track.
                - "largest": Track the face with the largest bounding box.
                - "centered": Track the face closest to frame center.
                - "index": Track the face at a specific index (sorted by area).
            face_index: Index of face to track when using "index" strategy.
            smoothing: Exponential moving average factor (0-1). Higher = smoother.
            detection_interval: Run detection every N frames, interpolate between.
            min_face_size: Minimum face size in pixels for detection.
            backend: Detection backend - "cpu", "gpu", or "auto".
            sample_rate: For GPU backend, detect every Nth frame and interpolate.
                Only used by track_video(). Default 1 (every frame).
            batch_size: Batch size for GPU detection. Default 16.
        """
        self.selection_strategy = selection_strategy
        self.face_index = face_index
        self.smoothing = smoothing
        self.detection_interval = detection_interval
        self.min_face_size = min_face_size
        self.backend: Literal["cpu", "gpu", "auto"] = backend
        self.sample_rate = sample_rate
        self.batch_size = batch_size

        self._detector: FaceDetector | None = None
        self._last_position: tuple[float, float] | None = None
        self._last_size: tuple[float, float] | None = None
        self._smoothed_position: tuple[float, float] | None = None
        self._smoothed_size: tuple[float, float] | None = None

    def _init_detector(self) -> None:
        """Initialize face detector lazily."""
        self._detector = FaceDetector(
            min_face_size=self.min_face_size,
            backend=self.backend,
        )

    def _select_face(
        self,
        faces: list,
        frame_width: int,
        frame_height: int,
    ) -> tuple[float, float, float, float] | None:
        """Select a face based on the configured strategy.

        Args:
            faces: List of DetectedFace objects.
            frame_width: Width of the frame.
            frame_height: Height of the frame.

        Returns:
            Tuple of (center_x, center_y, width, height) in normalized coords, or None.
        """
        if not faces:
            return None

        if self.selection_strategy == "largest":
            # Faces are already sorted by area (largest first)
            face = faces[0]
        elif self.selection_strategy == "centered":
            # Find face closest to center
            frame_center = (0.5, 0.5)
            face = min(
                faces,
                key=lambda f: (
                    (f.bounding_box.center[0] - frame_center[0]) ** 2
                    + (f.bounding_box.center[1] - frame_center[1]) ** 2
                ),
            )
        elif self.selection_strategy == "index":
            if self.face_index < len(faces):
                face = faces[self.face_index]
            else:
                face = faces[0]  # Fall back to largest
        else:
            face = faces[0]

        bbox = face.bounding_box
        return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)

    def detect_and_track(
        self,
        frame: np.ndarray,
        frame_index: int,
    ) -> tuple[float, float, float, float] | None:
        """Detect face in frame and return smoothed position.

        Args:
            frame: Video frame as numpy array (H, W, 3).
            frame_index: Index of current frame.

        Returns:
            Tuple of (center_x, center_y, width, height) in normalized coords,
            or None if no face detected and no fallback available.
        """
        if self._detector is None:
            self._init_detector()
            assert self._detector is not None

        h, w = frame.shape[:2]

        # Only run detection on interval frames
        should_detect = frame_index % self.detection_interval == 0

        if should_detect:
            faces = self._detector.detect(frame)
            face_info = self._select_face(faces, w, h)

            if face_info:
                cx, cy, fw, fh = face_info
                self._last_position = (cx, cy)
                self._last_size = (fw, fh)
        else:
            # Use last detected position
            face_info = None
            if self._last_position and self._last_size:
                face_info = (*self._last_position, *self._last_size)

        if face_info:
            cx, cy, fw, fh = face_info

            # Apply exponential moving average smoothing
            if self._smoothed_position is None:
                self._smoothed_position = (cx, cy)
                self._smoothed_size = (fw, fh)
            else:
                alpha = 1 - self.smoothing
                self._smoothed_position = (
                    self._smoothed_position[0] * self.smoothing + cx * alpha,
                    self._smoothed_position[1] * self.smoothing + cy * alpha,
                )
                assert self._smoothed_size is not None  # Set alongside _smoothed_position
                self._smoothed_size = (
                    self._smoothed_size[0] * self.smoothing + fw * alpha,
                    self._smoothed_size[1] * self.smoothing + fh * alpha,
                )

            return (*self._smoothed_position, *self._smoothed_size)

        # Return last smoothed position as fallback
        if self._smoothed_position and self._smoothed_size:
            return (*self._smoothed_position, *self._smoothed_size)

        return None

    def reset(self) -> None:
        """Reset tracker state for a new video."""
        self._last_position = None
        self._last_size = None
        self._smoothed_position = None
        self._smoothed_size = None

    @staticmethod
    def _interpolate_bbox(
        bbox1: tuple[float, float, float, float],
        bbox2: tuple[float, float, float, float],
        t: float,
    ) -> tuple[float, float, float, float]:
        """Linearly interpolate between two bounding boxes.

        Args:
            bbox1: First bounding box (cx, cy, w, h).
            bbox2: Second bounding box (cx, cy, w, h).
            t: Interpolation factor (0 = bbox1, 1 = bbox2).

        Returns:
            Interpolated bounding box (cx, cy, w, h).
        """
        return (
            bbox1[0] + (bbox2[0] - bbox1[0]) * t,
            bbox1[1] + (bbox2[1] - bbox1[1]) * t,
            bbox1[2] + (bbox2[2] - bbox1[2]) * t,
            bbox1[3] + (bbox2[3] - bbox1[3]) * t,
        )

    def track_video(
        self,
        frames: np.ndarray,
    ) -> list[tuple[float, float, float, float] | None]:
        """Track face through entire video using optimized batch detection.

        This method is optimized for GPU backends with frame sampling and
        interpolation for smooth tracking with reduced computation.

        Args:
            frames: Video frames array of shape (N, H, W, 3).

        Returns:
            List of face positions (cx, cy, w, h) for each frame, or None if
            no face detected and no fallback available.
        """
        if self._detector is None:
            self._init_detector()
            assert self._detector is not None

        n_frames = len(frames)
        if n_frames == 0:
            return []

        h, w = frames[0].shape[:2]

        # Determine which frames to sample
        if self.sample_rate > 1 and self.backend in ("gpu", "auto"):
            sample_indices = list(range(0, n_frames, self.sample_rate))
            # Ensure last frame is included
            if sample_indices[-1] != n_frames - 1:
                sample_indices.append(n_frames - 1)
        else:
            sample_indices = list(range(n_frames))

        # Batch detect on sampled frames
        sampled_frames = [frames[i] for i in sample_indices]

        # Process in batches
        sampled_detections: list[list] = []
        for batch_start in range(0, len(sampled_frames), self.batch_size):
            batch_end = min(batch_start + self.batch_size, len(sampled_frames))
            batch = sampled_frames[batch_start:batch_end]
            batch_results = self._detector.detect_batch(batch)
            sampled_detections.extend(batch_results)

        # Extract face info from detections
        sampled_faces: list[tuple[float, float, float, float] | None] = []
        for faces in sampled_detections:
            face_info = self._select_face(faces, w, h)
            sampled_faces.append(face_info)

        # If no sampling, apply smoothing directly
        if self.sample_rate == 1 or self.backend == "cpu":
            self.reset()
            results: list[tuple[float, float, float, float] | None] = []
            for i, face_info in enumerate(sampled_faces):
                if face_info:
                    cx, cy, fw, fh = face_info
                    self._last_position = (cx, cy)
                    self._last_size = (fw, fh)

                    if self._smoothed_position is None:
                        self._smoothed_position = (cx, cy)
                        self._smoothed_size = (fw, fh)
                    else:
                        alpha = 1 - self.smoothing
                        self._smoothed_position = (
                            self._smoothed_position[0] * self.smoothing + cx * alpha,
                            self._smoothed_position[1] * self.smoothing + cy * alpha,
                        )
                        assert self._smoothed_size is not None
                        self._smoothed_size = (
                            self._smoothed_size[0] * self.smoothing + fw * alpha,
                            self._smoothed_size[1] * self.smoothing + fh * alpha,
                        )

                    results.append((*self._smoothed_position, *self._smoothed_size))
                elif self._smoothed_position and self._smoothed_size:
                    results.append((*self._smoothed_position, *self._smoothed_size))
                else:
                    results.append(None)
            return results

        # Interpolate between sampled frames
        all_positions: list[tuple[float, float, float, float] | None] = [None] * n_frames

        # Fill in sampled positions
        for idx, sample_idx in enumerate(sample_indices):
            all_positions[sample_idx] = sampled_faces[idx]

        # Interpolate gaps
        for i in range(len(sample_indices) - 1):
            start_idx = sample_indices[i]
            end_idx = sample_indices[i + 1]
            start_face = sampled_faces[i]
            end_face = sampled_faces[i + 1]

            if start_face is None and end_face is None:
                continue
            elif start_face is None:
                # Use end face for all
                for j in range(start_idx, end_idx):
                    all_positions[j] = end_face
            elif end_face is None:
                # Use start face for all
                for j in range(start_idx + 1, end_idx + 1):
                    all_positions[j] = start_face
            else:
                # Interpolate
                gap = end_idx - start_idx
                for j in range(start_idx + 1, end_idx):
                    t = (j - start_idx) / gap
                    all_positions[j] = self._interpolate_bbox(start_face, end_face, t)

        # Apply smoothing to interpolated positions
        self.reset()
        results = []
        for face_info in all_positions:
            if face_info:
                cx, cy, fw, fh = face_info

                if self._smoothed_position is None:
                    self._smoothed_position = (cx, cy)
                    self._smoothed_size = (fw, fh)
                else:
                    alpha = 1 - self.smoothing
                    self._smoothed_position = (
                        self._smoothed_position[0] * self.smoothing + cx * alpha,
                        self._smoothed_position[1] * self.smoothing + cy * alpha,
                    )
                    assert self._smoothed_size is not None
                    self._smoothed_size = (
                        self._smoothed_size[0] * self.smoothing + fw * alpha,
                        self._smoothed_size[1] * self.smoothing + fh * alpha,
                    )

                results.append((*self._smoothed_position, *self._smoothed_size))
            elif self._smoothed_position and self._smoothed_size:
                results.append((*self._smoothed_position, *self._smoothed_size))
            else:
                results.append(None)

        return results

__init__

__init__(
    selection_strategy: Literal[
        "largest", "centered", "index"
    ] = "largest",
    face_index: int = 0,
    smoothing: float = 0.8,
    detection_interval: int = 3,
    min_face_size: int = 30,
    backend: Literal["cpu", "gpu", "auto"] = "cpu",
    sample_rate: int = 1,
    batch_size: int = 16,
)

Initialize face tracker.

Parameters:

Name Type Description Default
selection_strategy Literal['largest', 'centered', 'index']

How to select which face to track. - "largest": Track the face with the largest bounding box. - "centered": Track the face closest to frame center. - "index": Track the face at a specific index (sorted by area).

'largest'
face_index int

Index of face to track when using "index" strategy.

0
smoothing float

Exponential moving average factor (0-1). Higher = smoother.

0.8
detection_interval int

Run detection every N frames, interpolate between.

3
min_face_size int

Minimum face size in pixels for detection.

30
backend Literal['cpu', 'gpu', 'auto']

Detection backend - "cpu", "gpu", or "auto".

'cpu'
sample_rate int

For GPU backend, detect every Nth frame and interpolate. Only used by track_video(). Default 1 (every frame).

1
batch_size int

Batch size for GPU detection. Default 16.

16
Source code in src/videopython/ai/transforms.py
def __init__(
    self,
    selection_strategy: Literal["largest", "centered", "index"] = "largest",
    face_index: int = 0,
    smoothing: float = 0.8,
    detection_interval: int = 3,
    min_face_size: int = 30,
    backend: Literal["cpu", "gpu", "auto"] = "cpu",
    sample_rate: int = 1,
    batch_size: int = 16,
):
    """Configure a new face tracker.

    Args:
        selection_strategy: Strategy for picking the tracked face:
            "largest" (biggest bounding box), "centered" (closest to the
            frame center), or "index" (the face at ``face_index``, with
            faces sorted by area).
        face_index: Which face to follow under the "index" strategy.
        smoothing: Exponential-moving-average weight in [0, 1]; larger
            values give smoother (slower-reacting) tracking.
        detection_interval: Run detection only every N frames, reusing the
            last result in between.
        min_face_size: Smallest face (in pixels) the detector will report.
        backend: Detection backend - "cpu", "gpu", or "auto".
        sample_rate: GPU-oriented option: detect on every Nth frame and
            interpolate; consumed only by track_video(). Default 1
            (detect on every frame).
        batch_size: Number of frames per GPU detection batch. Default 16.
    """
    # Detection / selection configuration.
    self.selection_strategy = selection_strategy
    self.face_index = face_index
    self.smoothing = smoothing
    self.detection_interval = detection_interval
    self.min_face_size = min_face_size
    self.backend: Literal["cpu", "gpu", "auto"] = backend
    self.sample_rate = sample_rate
    self.batch_size = batch_size

    # Mutable tracking state; the detector itself is built lazily on first use.
    self._detector: FaceDetector | None = None
    self._last_position: tuple[float, float] | None = None
    self._last_size: tuple[float, float] | None = None
    self._smoothed_position: tuple[float, float] | None = None
    self._smoothed_size: tuple[float, float] | None = None

detect_and_track

detect_and_track(
    frame: ndarray, frame_index: int
) -> tuple[float, float, float, float] | None

Detect face in frame and return smoothed position.

Parameters:

Name Type Description Default
frame ndarray

Video frame as numpy array (H, W, 3).

required
frame_index int

Index of current frame.

required

Returns:

Type Description
tuple[float, float, float, float] | None

Tuple of (center_x, center_y, width, height) in normalized coords,

tuple[float, float, float, float] | None

or None if no face detected and no fallback available.

Source code in src/videopython/ai/transforms.py
def detect_and_track(
    self,
    frame: np.ndarray,
    frame_index: int,
) -> tuple[float, float, float, float] | None:
    """Detect face in frame and return smoothed position.

    Args:
        frame: Video frame as numpy array (H, W, 3).
        frame_index: Index of current frame.

    Returns:
        Tuple of (center_x, center_y, width, height) in normalized coords,
        or None if no face detected and no fallback available.
    """
    if self._detector is None:
        self._init_detector()
        assert self._detector is not None

    h, w = frame.shape[:2]

    # Only run detection on interval frames
    should_detect = frame_index % self.detection_interval == 0

    if should_detect:
        faces = self._detector.detect(frame)
        face_info = self._select_face(faces, w, h)

        if face_info:
            cx, cy, fw, fh = face_info
            self._last_position = (cx, cy)
            self._last_size = (fw, fh)
    else:
        # Use last detected position
        face_info = None
        if self._last_position and self._last_size:
            face_info = (*self._last_position, *self._last_size)

    if face_info:
        cx, cy, fw, fh = face_info

        # Apply exponential moving average smoothing
        if self._smoothed_position is None:
            self._smoothed_position = (cx, cy)
            self._smoothed_size = (fw, fh)
        else:
            alpha = 1 - self.smoothing
            self._smoothed_position = (
                self._smoothed_position[0] * self.smoothing + cx * alpha,
                self._smoothed_position[1] * self.smoothing + cy * alpha,
            )
            assert self._smoothed_size is not None  # Set alongside _smoothed_position
            self._smoothed_size = (
                self._smoothed_size[0] * self.smoothing + fw * alpha,
                self._smoothed_size[1] * self.smoothing + fh * alpha,
            )

        return (*self._smoothed_position, *self._smoothed_size)

    # Return last smoothed position as fallback
    if self._smoothed_position and self._smoothed_size:
        return (*self._smoothed_position, *self._smoothed_size)

    return None

reset

reset() -> None

Reset tracker state for a new video.

Source code in src/videopython/ai/transforms.py
def reset(self) -> None:
    """Clear all tracking state so the tracker can start on a fresh video."""
    for attr in ("_last_position", "_last_size", "_smoothed_position", "_smoothed_size"):
        setattr(self, attr, None)

track_video

track_video(
    frames: ndarray,
) -> list[tuple[float, float, float, float] | None]

Track face through entire video using optimized batch detection.

This method is optimized for GPU backends with frame sampling and interpolation for smooth tracking with reduced computation.

Parameters:

Name Type Description Default
frames ndarray

Video frames array of shape (N, H, W, 3).

required

Returns:

Type Description
list[tuple[float, float, float, float] | None]

List of face positions (cx, cy, w, h) for each frame, or None if

list[tuple[float, float, float, float] | None]

no face detected and no fallback available.

Source code in src/videopython/ai/transforms.py
def track_video(
    self,
    frames: np.ndarray,
) -> list[tuple[float, float, float, float] | None]:
    """Track face through entire video using optimized batch detection.

    This method is optimized for GPU backends with frame sampling and
    interpolation for smooth tracking with reduced computation.

    Note:
        Calls ``reset()`` internally before smoothing, so each invocation
        starts from a clean EMA state.

    Args:
        frames: Video frames array of shape (N, H, W, 3).

    Returns:
        List of face positions (cx, cy, w, h) for each frame, or None if
        no face detected and no fallback available.
    """
    # Detector is created lazily so construction stays cheap.
    if self._detector is None:
        self._init_detector()
        assert self._detector is not None

    n_frames = len(frames)
    if n_frames == 0:
        return []

    h, w = frames[0].shape[:2]

    # Determine which frames to sample
    # (sampling only applies on GPU-style backends; otherwise all frames).
    if self.sample_rate > 1 and self.backend in ("gpu", "auto"):
        sample_indices = list(range(0, n_frames, self.sample_rate))
        # Ensure last frame is included
        # (so interpolation below always has an anchor at the tail).
        if sample_indices[-1] != n_frames - 1:
            sample_indices.append(n_frames - 1)
    else:
        sample_indices = list(range(n_frames))

    # Batch detect on sampled frames
    sampled_frames = [frames[i] for i in sample_indices]

    # Process in batches
    sampled_detections: list[list] = []
    for batch_start in range(0, len(sampled_frames), self.batch_size):
        batch_end = min(batch_start + self.batch_size, len(sampled_frames))
        batch = sampled_frames[batch_start:batch_end]
        batch_results = self._detector.detect_batch(batch)
        sampled_detections.extend(batch_results)

    # Extract face info from detections
    sampled_faces: list[tuple[float, float, float, float] | None] = []
    for faces in sampled_detections:
        face_info = self._select_face(faces, w, h)
        sampled_faces.append(face_info)

    # If no sampling, apply smoothing directly
    # (every frame was detected, so there are no gaps to interpolate).
    if self.sample_rate == 1 or self.backend == "cpu":
        self.reset()
        results: list[tuple[float, float, float, float] | None] = []
        for i, face_info in enumerate(sampled_faces):
            if face_info:
                cx, cy, fw, fh = face_info
                self._last_position = (cx, cy)
                self._last_size = (fw, fh)

                if self._smoothed_position is None:
                    # First observation seeds the EMA directly.
                    self._smoothed_position = (cx, cy)
                    self._smoothed_size = (fw, fh)
                else:
                    alpha = 1 - self.smoothing
                    self._smoothed_position = (
                        self._smoothed_position[0] * self.smoothing + cx * alpha,
                        self._smoothed_position[1] * self.smoothing + cy * alpha,
                    )
                    assert self._smoothed_size is not None
                    self._smoothed_size = (
                        self._smoothed_size[0] * self.smoothing + fw * alpha,
                        self._smoothed_size[1] * self.smoothing + fh * alpha,
                    )

                results.append((*self._smoothed_position, *self._smoothed_size))
            elif self._smoothed_position and self._smoothed_size:
                # No face this frame: fall back to last smoothed position.
                results.append((*self._smoothed_position, *self._smoothed_size))
            else:
                results.append(None)
        return results

    # Interpolate between sampled frames
    all_positions: list[tuple[float, float, float, float] | None] = [None] * n_frames

    # Fill in sampled positions
    for idx, sample_idx in enumerate(sample_indices):
        all_positions[sample_idx] = sampled_faces[idx]

    # Interpolate gaps
    # (when only one anchor has a face, it is held constant across the gap).
    for i in range(len(sample_indices) - 1):
        start_idx = sample_indices[i]
        end_idx = sample_indices[i + 1]
        start_face = sampled_faces[i]
        end_face = sampled_faces[i + 1]

        if start_face is None and end_face is None:
            continue
        elif start_face is None:
            # Use end face for all
            for j in range(start_idx, end_idx):
                all_positions[j] = end_face
        elif end_face is None:
            # Use start face for all
            for j in range(start_idx + 1, end_idx + 1):
                all_positions[j] = start_face
        else:
            # Interpolate
            gap = end_idx - start_idx
            for j in range(start_idx + 1, end_idx):
                t = (j - start_idx) / gap
                all_positions[j] = self._interpolate_bbox(start_face, end_face, t)

    # Apply smoothing to interpolated positions
    # (same EMA pass as detect_and_track, run over the filled-in timeline).
    self.reset()
    results = []
    for face_info in all_positions:
        if face_info:
            cx, cy, fw, fh = face_info

            if self._smoothed_position is None:
                self._smoothed_position = (cx, cy)
                self._smoothed_size = (fw, fh)
            else:
                alpha = 1 - self.smoothing
                self._smoothed_position = (
                    self._smoothed_position[0] * self.smoothing + cx * alpha,
                    self._smoothed_position[1] * self.smoothing + cy * alpha,
                )
                assert self._smoothed_size is not None
                self._smoothed_size = (
                    self._smoothed_size[0] * self.smoothing + fw * alpha,
                    self._smoothed_size[1] * self.smoothing + fh * alpha,
                )

            results.append((*self._smoothed_position, *self._smoothed_size))
        elif self._smoothed_position and self._smoothed_size:
            # No face even after interpolation: reuse last smoothed position.
            results.append((*self._smoothed_position, *self._smoothed_size))
        else:
            results.append(None)

    return results

FaceTrackingCrop

FaceTrackingCrop

Bases: Transformation

Crops video to follow detected faces.

Useful for creating vertical (9:16) content from horizontal (16:9) video by tracking the speaker's face and keeping it framed.

Supports GPU acceleration for faster processing with optional frame sampling. Also supports simple cinematographic framing rules (headroom / thirds) and optional movement speed clamping.

Example

CPU (default, backward compatible)

video = FaceTrackingCrop().apply(video)

GPU with frame sampling for speed

video = FaceTrackingCrop(backend="gpu", sample_rate=5).apply(video)

Source code in src/videopython/ai/transforms.py
class FaceTrackingCrop(Transformation):
    """Crops video to follow detected faces.

    Useful for creating vertical (9:16) content from horizontal (16:9) video
    by tracking the speaker's face and keeping it framed.

    Supports GPU acceleration for faster processing with optional frame sampling.
    Also supports simple cinematographic framing rules (headroom / thirds) and
    optional movement speed clamping.

    Example:
        >>> # CPU (default, backward compatible)
        >>> video = FaceTrackingCrop().apply(video)
        >>>
        >>> # GPU with frame sampling for speed
        >>> video = FaceTrackingCrop(backend="gpu", sample_rate=5).apply(video)
    """

    def __init__(
        self,
        target_aspect: tuple[int, int] = (9, 16),
        face_selection: Literal["largest", "centered", "index"] = "largest",
        face_index: int | None = None,
        padding: float = 0.3,
        vertical_offset: float = -0.1,
        framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
        headroom: float = 0.15,
        lead_room: float = 0.1,
        smoothing: float = 0.8,
        max_speed: float | None = None,
        fallback: Literal["center", "last_position", "full_frame"] = "last_position",
        detection_interval: int = 3,
        backend: Literal["cpu", "gpu", "auto"] = "cpu",
        sample_rate: int = 1,
    ):
        """Initialize face tracking crop.

        Args:
            target_aspect: Output aspect ratio as (width, height).
            face_selection: Strategy for selecting which face to track.
            face_index: Index of face to track when using "index" selection.
            padding: Extra space around face (0.3 = 30% padding on each side).
            vertical_offset: Legacy vertical position offset used by ``framing_rule="offset"``.
            framing_rule: Subject framing strategy.
                - "offset": Use legacy ``vertical_offset`` behavior.
                - "center": Keep face centered.
                - "headroom": Keep extra room above the face.
                - "thirds": Place face near the upper-third line.
                - "dynamic": Currently same as "headroom".
            headroom: Headroom amount for framing rules that use it.
            lead_room: Reserved for future motion/look-direction framing.
            smoothing: Position smoothing factor (0-1, higher = smoother).
            max_speed: Optional max camera movement per frame (normalized).
            fallback: Behavior when no face detected.
            detection_interval: Frames between face detections.
            backend: Detection backend - "cpu", "gpu", or "auto".
            sample_rate: For GPU backend, detect every Nth frame and interpolate.
        """
        self.target_aspect = target_aspect
        self.face_selection = face_selection
        # "index" selection falls back to the first detected face.
        self.face_index = face_index if face_index is not None else 0
        self.padding = padding
        self.vertical_offset = vertical_offset
        self.framing_rule = framing_rule
        self.headroom = headroom
        # Currently unused by the framing rules below; kept for forward compatibility.
        self.lead_room = lead_room
        self.smoothing = smoothing
        self.max_speed = max_speed
        self.fallback = fallback
        self.detection_interval = detection_interval
        self.backend: Literal["cpu", "gpu", "auto"] = backend
        self.sample_rate = sample_rate

    def _apply_framing_offset(
        self,
        face_cx: float,
        face_cy: float,
        face_h: float,
    ) -> tuple[float, float]:
        """Apply framing rule to get desired crop center in normalized coords.

        Args:
            face_cx, face_cy: Face center in normalized (0-1) coordinates.
            face_h: Face height in normalized coordinates. Currently unused;
                reserved for size-aware framing rules.

        Returns:
            Desired crop center (x, y) in normalized coordinates.
        """
        if self.framing_rule == "offset":
            return (face_cx, face_cy + self.vertical_offset)
        if self.framing_rule == "center":
            return (face_cx, face_cy)
        if self.framing_rule == "headroom":
            # Shift the crop center up so extra space appears above the face.
            return (face_cx, face_cy - self.headroom)
        if self.framing_rule == "thirds":
            # Place the face on the upper-third line: the crop center sits
            # (1/2 - 1/3) below the face, i.e. face lands 1/3 from the top.
            return (face_cx, face_cy - (1 / 3 - 0.5))
        if self.framing_rule == "dynamic":
            # Placeholder until motion/look-direction framing is implemented.
            return (face_cx, face_cy - self.headroom)
        return (face_cx, face_cy)

    def _clamp_speed(
        self,
        current: tuple[float, float],
        target: tuple[float, float],
    ) -> tuple[float, float]:
        """Clamp crop-center movement speed if ``max_speed`` is configured.

        Args:
            current: Current crop center in normalized coordinates.
            target: Desired crop center in normalized coordinates.

        Returns:
            ``target`` when within the speed budget, otherwise a point at most
            ``max_speed`` away from ``current`` along the current→target line.
        """
        if self.max_speed is None:
            return target

        dx = target[0] - current[0]
        dy = target[1] - current[1]
        distance = (dx**2 + dy**2) ** 0.5

        # Check the zero-distance case first: it keeps the division below safe
        # and remains reachable even for a misconfigured (negative) max_speed.
        # (Previously this check sat after the threshold test, where it was
        # unreachable for any non-negative max_speed.)
        if distance == 0 or distance <= self.max_speed:
            return target

        scale = self.max_speed / distance
        return (current[0] + dx * scale, current[1] + dy * scale)

    def _calculate_crop_region(
        self,
        face_cx: float,
        face_cy: float,
        face_w: float,
        face_h: float,
        frame_w: int,
        frame_h: int,
        center_position: tuple[float, float] | None = None,
    ) -> tuple[int, int, int, int]:
        """Calculate crop region centered on face with padding and framing.

        Args:
            face_cx, face_cy: Face center in normalized coords.
            face_w, face_h: Face dimensions in normalized coords.
            frame_w, frame_h: Frame dimensions in pixels.
            center_position: Optional crop center override in normalized coords.

        Returns:
            Tuple of (x, y, width, height) for crop region in pixels.
        """
        target_ratio = self.target_aspect[0] / self.target_aspect[1]
        frame_ratio = frame_w / frame_h

        # Calculate crop size to achieve target aspect ratio
        # Use _make_even to ensure H.264 compatibility
        if target_ratio < frame_ratio:
            # Target is taller (e.g., 9:16) - height limited
            crop_h = _make_even(frame_h)
            crop_w = _make_even(int(crop_h * target_ratio))
        else:
            # Target is wider - width limited
            crop_w = _make_even(frame_w)
            crop_h = _make_even(int(crop_w / target_ratio))

        # Calculate minimum crop size based on face + padding
        min_face_dim = max(face_w * frame_w, face_h * frame_h)
        min_crop_dim = min_face_dim * (1 + 2 * self.padding)

        # Ensure crop is at least large enough for face with padding
        if crop_w < min_crop_dim * target_ratio:
            crop_w = _make_even(min(int(min_crop_dim * target_ratio), frame_w))
            crop_h = _make_even(min(int(crop_w / target_ratio), frame_h))

        if center_position is None:
            center_position = self._apply_framing_offset(face_cx, face_cy, face_h)

        # Convert the normalized crop center into pixel space.
        center_x = center_position[0] * frame_w
        center_y = center_position[1] * frame_h

        x = int(center_x - crop_w / 2)
        y = int(center_y - crop_h / 2)

        # Clamp to frame bounds
        x = max(0, min(x, frame_w - crop_w))
        y = max(0, min(y, frame_h - crop_h))

        return (x, y, crop_w, crop_h)

    def apply(self, video: Video) -> Video:
        """Apply face tracking crop to video.

        Args:
            video: Input video.

        Returns:
            Video cropped to follow faces. The input object is mutated in
            place (its ``frames`` array is replaced) and returned.
        """
        # Fresh tracker per call so smoothing state never leaks across videos.
        tracker = FaceTracker(
            selection_strategy=self.face_selection,
            face_index=self.face_index,
            smoothing=self.smoothing,
            detection_interval=self.detection_interval,
            backend=self.backend,
            sample_rate=self.sample_rate,
        )

        h, w = video.frame_shape[:2]
        target_ratio = self.target_aspect[0] / self.target_aspect[1]

        # Calculate output dimensions maintaining target aspect ratio
        # Use _make_even to ensure H.264 compatibility (requires even dimensions)
        if target_ratio < w / h:
            out_h = _make_even(h)
            out_w = _make_even(int(out_h * target_ratio))
        else:
            out_w = _make_even(w)
            out_h = _make_even(int(out_w / target_ratio))

        # Default crop region (center)
        default_x = (w - out_w) // 2
        default_y = (h - out_h) // 2
        last_crop = (default_x, default_y, out_w, out_h)
        # Crop center state used for speed clamping; starts at frame center.
        current_position = (0.5, 0.5)

        framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
        print(
            "Face tracking crop: "
            f"{w}x{h} -> {out_w}x{out_h} "
            f"({self.target_aspect[0]}:{self.target_aspect[1]}, framing={framing_label})"
        )

        new_frames = []
        for i in tqdm(range(len(video.frames)), desc="Face tracking crop"):
            frame = video.frames[i]
            face_info = tracker.detect_and_track(frame, i)

            if face_info:
                cx, cy, fw, fh = face_info
                target_position = self._apply_framing_offset(cx, cy, fh)
                # Limit camera movement per frame when max_speed is set.
                current_position = self._clamp_speed(current_position, target_position)
                crop = self._calculate_crop_region(
                    cx,
                    cy,
                    fw,
                    fh,
                    w,
                    h,
                    center_position=current_position,
                )
                last_crop = crop
            else:
                # Fallback behavior
                if self.fallback == "center":
                    crop = (default_x, default_y, out_w, out_h)
                elif self.fallback == "last_position":
                    crop = last_crop
                else:  # full_frame
                    crop = (0, 0, w, h)

            x, y, cw, ch = crop
            cropped = frame[y : y + ch, x : x + cw]

            # Resize to output dimensions if needed
            if cropped.shape[1] != out_w or cropped.shape[0] != out_h:
                cropped = cv2.resize(cropped, (out_w, out_h), interpolation=cv2.INTER_AREA)

            new_frames.append(cropped)

        video.frames = np.array(new_frames, dtype=np.uint8)
        return video

__init__

__init__(
    target_aspect: tuple[int, int] = (9, 16),
    face_selection: Literal[
        "largest", "centered", "index"
    ] = "largest",
    face_index: int | None = None,
    padding: float = 0.3,
    vertical_offset: float = -0.1,
    framing_rule: Literal[
        "offset", "center", "headroom", "thirds", "dynamic"
    ] = "offset",
    headroom: float = 0.15,
    lead_room: float = 0.1,
    smoothing: float = 0.8,
    max_speed: float | None = None,
    fallback: Literal[
        "center", "last_position", "full_frame"
    ] = "last_position",
    detection_interval: int = 3,
    backend: Literal["cpu", "gpu", "auto"] = "cpu",
    sample_rate: int = 1,
)

Initialize face tracking crop.

Parameters:

Name Type Description Default
target_aspect tuple[int, int]

Output aspect ratio as (width, height).

(9, 16)
face_selection Literal['largest', 'centered', 'index']

Strategy for selecting which face to track.

'largest'
face_index int | None

Index of face to track when using "index" selection.

None
padding float

Extra space around face (0.3 = 30% padding on each side).

0.3
vertical_offset float

Legacy vertical position offset used by framing_rule="offset".

-0.1
framing_rule Literal['offset', 'center', 'headroom', 'thirds', 'dynamic']

Subject framing strategy. - "offset": Use legacy vertical_offset behavior. - "center": Keep face centered. - "headroom": Keep extra room above the face. - "thirds": Place face near the upper-third line. - "dynamic": Currently same as "headroom".

'offset'
headroom float

Headroom amount for framing rules that use it.

0.15
lead_room float

Reserved for future motion/look-direction framing.

0.1
smoothing float

Position smoothing factor (0-1, higher = smoother).

0.8
max_speed float | None

Optional max camera movement per frame (normalized).

None
fallback Literal['center', 'last_position', 'full_frame']

Behavior when no face detected.

'last_position'
detection_interval int

Frames between face detections.

3
backend Literal['cpu', 'gpu', 'auto']

Detection backend - "cpu", "gpu", or "auto".

'cpu'
sample_rate int

For GPU backend, detect every Nth frame and interpolate.

1
Source code in src/videopython/ai/transforms.py
def __init__(
    self,
    target_aspect: tuple[int, int] = (9, 16),
    face_selection: Literal["largest", "centered", "index"] = "largest",
    face_index: int | None = None,
    padding: float = 0.3,
    vertical_offset: float = -0.1,
    framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
    headroom: float = 0.15,
    lead_room: float = 0.1,
    smoothing: float = 0.8,
    max_speed: float | None = None,
    fallback: Literal["center", "last_position", "full_frame"] = "last_position",
    detection_interval: int = 3,
    backend: Literal["cpu", "gpu", "auto"] = "cpu",
    sample_rate: int = 1,
):
    """Initialize face tracking crop.

    Args:
        target_aspect: Output aspect ratio as (width, height).
        face_selection: Strategy for selecting which face to track.
        face_index: Index of face to track when using "index" selection.
        padding: Extra space around face (0.3 = 30% padding on each side).
        vertical_offset: Legacy vertical position offset used by ``framing_rule="offset"``.
        framing_rule: Subject framing strategy.
            - "offset": Use legacy ``vertical_offset`` behavior.
            - "center": Keep face centered.
            - "headroom": Keep extra room above the face.
            - "thirds": Place face near the upper-third line.
            - "dynamic": Currently same as "headroom".
        headroom: Headroom amount for framing rules that use it.
        lead_room: Reserved for future motion/look-direction framing.
        smoothing: Position smoothing factor (0-1, higher = smoother).
        max_speed: Optional max camera movement per frame (normalized).
        fallback: Behavior when no face detected.
        detection_interval: Frames between face detections.
        backend: Detection backend - "cpu", "gpu", or "auto".
        sample_rate: For GPU backend, detect every Nth frame and interpolate.
    """
    self.target_aspect = target_aspect
    self.face_selection = face_selection
    self.face_index = face_index if face_index is not None else 0
    self.padding = padding
    self.vertical_offset = vertical_offset
    self.framing_rule = framing_rule
    self.headroom = headroom
    self.lead_room = lead_room
    self.smoothing = smoothing
    self.max_speed = max_speed
    self.fallback = fallback
    self.detection_interval = detection_interval
    self.backend: Literal["cpu", "gpu", "auto"] = backend
    self.sample_rate = sample_rate

apply

apply(video: Video) -> Video

Apply face tracking crop to video.

Parameters:

Name Type Description Default
video Video

Input video.

required

Returns:

Type Description
Video

Video cropped to follow faces.

Source code in src/videopython/ai/transforms.py
def apply(self, video: Video) -> Video:
    """Apply face tracking crop to video.

    Args:
        video: Input video.

    Returns:
        Video cropped to follow faces. The input object is mutated in place
        (its ``frames`` array is replaced) and returned.
    """
    # Fresh tracker per call so smoothing state never leaks across videos.
    tracker = FaceTracker(
        selection_strategy=self.face_selection,
        face_index=self.face_index,
        smoothing=self.smoothing,
        detection_interval=self.detection_interval,
        backend=self.backend,
        sample_rate=self.sample_rate,
    )

    h, w = video.frame_shape[:2]
    target_ratio = self.target_aspect[0] / self.target_aspect[1]

    # Calculate output dimensions maintaining target aspect ratio
    # Use _make_even to ensure H.264 compatibility (requires even dimensions)
    if target_ratio < w / h:
        out_h = _make_even(h)
        out_w = _make_even(int(out_h * target_ratio))
    else:
        out_w = _make_even(w)
        out_h = _make_even(int(out_w / target_ratio))

    # Default crop region (center)
    default_x = (w - out_w) // 2
    default_y = (h - out_h) // 2
    last_crop = (default_x, default_y, out_w, out_h)
    # Crop-center state for speed clamping; starts at the frame center.
    current_position = (0.5, 0.5)

    framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
    print(
        "Face tracking crop: "
        f"{w}x{h} -> {out_w}x{out_h} "
        f"({self.target_aspect[0]}:{self.target_aspect[1]}, framing={framing_label})"
    )

    new_frames = []
    for i in tqdm(range(len(video.frames)), desc="Face tracking crop"):
        frame = video.frames[i]
        # Tracker handles its own detection cadence (detection_interval /
        # sample_rate) and returns a smoothed (cx, cy, w, h) or None.
        face_info = tracker.detect_and_track(frame, i)

        if face_info:
            cx, cy, fw, fh = face_info
            target_position = self._apply_framing_offset(cx, cy, fh)
            # Limit per-frame camera movement when max_speed is configured.
            current_position = self._clamp_speed(current_position, target_position)
            crop = self._calculate_crop_region(
                cx,
                cy,
                fw,
                fh,
                w,
                h,
                center_position=current_position,
            )
            # Remember the crop for the "last_position" fallback.
            last_crop = crop
        else:
            # Fallback behavior
            if self.fallback == "center":
                crop = (default_x, default_y, out_w, out_h)
            elif self.fallback == "last_position":
                crop = last_crop
            else:  # full_frame
                crop = (0, 0, w, h)

        x, y, cw, ch = crop
        cropped = frame[y : y + ch, x : x + cw]

        # Resize to output dimensions if needed
        # (e.g. when a "full_frame" fallback crop differs from out_w x out_h).
        if cropped.shape[1] != out_w or cropped.shape[0] != out_h:
            cropped = cv2.resize(cropped, (out_w, out_h), interpolation=cv2.INTER_AREA)

        new_frames.append(cropped)

    video.frames = np.array(new_frames, dtype=np.uint8)
    return video

SplitScreenComposite

SplitScreenComposite

Bases: Transformation

Arranges multiple face-tracked crops in a grid layout.

Useful for interview-style videos, reaction videos, or showing multiple perspectives simultaneously.

Supports GPU acceleration for faster face tracking.

Source code in src/videopython/ai/transforms.py
class SplitScreenComposite(Transformation):
    """Arranges multiple face-tracked crops in a grid layout.

    Useful for interview-style videos, reaction videos, or showing
    multiple perspectives simultaneously.

    Supports GPU acceleration for faster face tracking.
    """

    def __init__(
        self,
        layout: Literal["2x1", "1x2", "2x2", "1+2", "2+1"] = "2x1",
        output_size: tuple[int, int] | None = None,
        gap: int = 4,
        gap_color: tuple[int, int, int] = (0, 0, 0),
        border_width: int = 0,
        border_color: tuple[int, int, int] = (255, 255, 255),
        face_padding: float = 0.2,
        smoothing: float = 0.8,
        detection_interval: int = 3,
        audio_source: Literal["main", "loudest", "mix"] = "main",
        backend: Literal["cpu", "gpu", "auto"] = "cpu",
        sample_rate: int = 1,
    ):
        """Initialize split screen composite.

        Args:
            layout: Grid layout for cells.
                - "2x1": Two cells side by side (horizontal)
                - "1x2": Two cells stacked (vertical)
                - "2x2": Four cells in 2x2 grid
                - "1+2": One large cell on left, two small on right
                - "2+1": Two small cells on left, one large on right
            output_size: Output dimensions (width, height). If None, use source size.
            gap: Gap between cells in pixels.
            gap_color: Color of gap between cells (RGB).
            border_width: Border width around each cell.
            border_color: Border color (RGB).
            face_padding: Extra space around face in each cell.
            smoothing: Position smoothing factor.
            detection_interval: Frames between face detections.
            audio_source: Audio handling ("main" uses first source).
            backend: Detection backend - "cpu", "gpu", or "auto".
            sample_rate: For GPU backend, detect every Nth frame and interpolate.
        """
        self.layout = layout
        self.output_size = output_size
        self.gap = gap
        self.gap_color = gap_color
        self.border_width = border_width
        self.border_color = border_color
        # NOTE(review): stored but never read by apply() below — confirm
        # whether face_padding was meant to influence the per-cell crop.
        self.face_padding = face_padding
        self.smoothing = smoothing
        self.detection_interval = detection_interval
        # NOTE(review): stored but not consulted; apply() always keeps the
        # main video's audio regardless of this setting.
        self.audio_source = audio_source
        self.backend: Literal["cpu", "gpu", "auto"] = backend
        self.sample_rate = sample_rate

    def _get_cell_rects(self, width: int, height: int) -> list[tuple[int, int, int, int]]:
        """Calculate cell rectangles for the layout.

        The last cell along each axis takes the remaining pixels
        (``width - cell_w - gap`` etc.) so integer rounding never leaves
        an uncovered strip.

        Returns:
            List of (x, y, width, height) tuples for each cell.
        """
        gap = self.gap

        if self.layout == "2x1":
            cell_w = (width - gap) // 2
            return [
                (0, 0, cell_w, height),
                (cell_w + gap, 0, width - cell_w - gap, height),
            ]
        elif self.layout == "1x2":
            cell_h = (height - gap) // 2
            return [
                (0, 0, width, cell_h),
                (0, cell_h + gap, width, height - cell_h - gap),
            ]
        elif self.layout == "2x2":
            cell_w = (width - gap) // 2
            cell_h = (height - gap) // 2
            return [
                (0, 0, cell_w, cell_h),
                (cell_w + gap, 0, width - cell_w - gap, cell_h),
                (0, cell_h + gap, cell_w, height - cell_h - gap),
                (cell_w + gap, cell_h + gap, width - cell_w - gap, height - cell_h - gap),
            ]
        elif self.layout == "1+2":
            # Large cell on left (2/3 width), two small on right (1/3 width)
            large_w = (width - gap) * 2 // 3
            small_w = width - large_w - gap
            small_h = (height - gap) // 2
            return [
                (0, 0, large_w, height),
                (large_w + gap, 0, small_w, small_h),
                (large_w + gap, small_h + gap, small_w, height - small_h - gap),
            ]
        elif self.layout == "2+1":
            # Two small cells on left, large on right
            large_w = (width - gap) * 2 // 3
            small_w = width - large_w - gap
            small_h = (height - gap) // 2
            return [
                (0, 0, small_w, small_h),
                (0, small_h + gap, small_w, height - small_h - gap),
                (small_w + gap, 0, large_w, height),
            ]
        else:
            raise ValueError(f"Unknown layout: {self.layout}")

    def _get_required_sources(self) -> int:
        """Get number of video sources required for the layout."""
        if self.layout in ("2x1", "1x2"):
            return 2
        elif self.layout == "2x2":
            return 4
        else:  # 1+2, 2+1
            return 3

    def apply(self, video: Video, *additional_videos: Video) -> Video:
        """Apply split screen composite to videos.

        Args:
            video: Primary video (used for timing and audio).
            *additional_videos: Additional videos to include in grid.

        Returns:
            Composite video with all sources in grid layout. The primary
            video object is mutated in place (frames replaced) and returned.
        """
        all_videos = [video] + list(additional_videos)
        required = self._get_required_sources()

        if len(all_videos) < required:
            raise ValueError(f"Layout '{self.layout}' requires {required} videos, got {len(all_videos)}")

        # Use first video for timing
        n_frames = len(video.frames)

        # Determine output size
        if self.output_size:
            out_w, out_h = self.output_size
        else:
            out_w, out_h = video.frame_shape[1], video.frame_shape[0]

        # Keep final composite dimensions encoder-friendly by default.
        out_w = _make_even(out_w)
        out_h = _make_even(out_h)

        cell_rects = self._get_cell_rects(out_w, out_h)

        # Create face trackers for each cell
        # (one tracker per cell so smoothing state stays independent per source).
        trackers = [
            FaceTracker(
                selection_strategy="largest",
                smoothing=self.smoothing,
                detection_interval=self.detection_interval,
                backend=self.backend,
                sample_rate=self.sample_rate,
            )
            for _ in range(len(cell_rects))
        ]

        print(f"Creating {self.layout} split screen: {out_w}x{out_h}")

        new_frames = []
        for i in tqdm(range(n_frames), desc="Split screen composite"):
            # Create output frame with gap color
            output = np.full((out_h, out_w, 3), self.gap_color, dtype=np.uint8)

            for cell_idx, (cx, cy, cw, ch) in enumerate(cell_rects):
                if cell_idx >= len(all_videos):
                    break

                src_video = all_videos[cell_idx]
                # Shorter sources loop from their start so every cell stays filled.
                src_idx = i % len(src_video.frames)
                src_frame = src_video.frames[src_idx]
                src_h, src_w = src_frame.shape[:2]

                # Track face in source
                # (detection cadence is keyed to the composite index ``i``).
                face_info = trackers[cell_idx].detect_and_track(src_frame, i)

                if face_info:
                    fcx, fcy, fw, fh = face_info
                    # Calculate crop to fit face in cell
                    cell_aspect = cw / ch
                    src_aspect = src_w / src_h

                    if cell_aspect < src_aspect:
                        # Cell is taller - crop width
                        crop_h = _make_even(src_h)
                        crop_w = _make_even(int(crop_h * cell_aspect))
                    else:
                        # Cell is wider - crop height
                        crop_w = _make_even(src_w)
                        crop_h = _make_even(int(crop_w / cell_aspect))

                    # Center on face with padding consideration
                    center_x = int(fcx * src_w)
                    center_y = int(fcy * src_h)

                    # Clamp so the crop stays fully inside the source frame.
                    crop_x = max(0, min(center_x - crop_w // 2, src_w - crop_w))
                    crop_y = max(0, min(center_y - crop_h // 2, src_h - crop_h))

                    cropped = src_frame[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w]
                else:
                    # Center crop as fallback
                    cell_aspect = cw / ch
                    src_aspect = src_w / src_h

                    if cell_aspect < src_aspect:
                        crop_h = _make_even(src_h)
                        crop_w = _make_even(int(crop_h * cell_aspect))
                        crop_x = (src_w - crop_w) // 2
                        crop_y = 0
                    else:
                        crop_w = _make_even(src_w)
                        crop_h = _make_even(int(crop_w / cell_aspect))
                        crop_x = 0
                        crop_y = (src_h - crop_h) // 2

                    cropped = src_frame[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w]

                # Resize to cell size
                resized = cv2.resize(cropped, (cw, ch), interpolation=cv2.INTER_AREA)

                # Apply border if specified
                # (paints the four edge bands of the resized cell in place).
                if self.border_width > 0:
                    bw = self.border_width
                    resized[:bw, :] = self.border_color
                    resized[-bw:, :] = self.border_color
                    resized[:, :bw] = self.border_color
                    resized[:, -bw:] = self.border_color

                # Place in output
                output[cy : cy + ch, cx : cx + cw] = resized

            new_frames.append(output)

        video.frames = np.array(new_frames, dtype=np.uint8)

        # Audio handling - keep main video audio
        # (mixing multiple audio tracks would require more complex handling)
        return video

__init__

__init__(
    layout: Literal[
        "2x1", "1x2", "2x2", "1+2", "2+1"
    ] = "2x1",
    output_size: tuple[int, int] | None = None,
    gap: int = 4,
    gap_color: tuple[int, int, int] = (0, 0, 0),
    border_width: int = 0,
    border_color: tuple[int, int, int] = (255, 255, 255),
    face_padding: float = 0.2,
    smoothing: float = 0.8,
    detection_interval: int = 3,
    audio_source: Literal[
        "main", "loudest", "mix"
    ] = "main",
    backend: Literal["cpu", "gpu", "auto"] = "cpu",
    sample_rate: int = 1,
)

Initialize split screen composite.

Parameters:

Name Type Description Default
layout Literal['2x1', '1x2', '2x2', '1+2', '2+1']

Grid layout for cells. - "2x1": Two cells side by side (horizontal) - "1x2": Two cells stacked (vertical) - "2x2": Four cells in 2x2 grid - "1+2": One large cell on left, two small on right - "2+1": Two small cells on left, one large on right

'2x1'
output_size tuple[int, int] | None

Output dimensions (width, height). If None, use source size.

None
gap int

Gap between cells in pixels.

4
gap_color tuple[int, int, int]

Color of gap between cells (RGB).

(0, 0, 0)
border_width int

Border width around each cell.

0
border_color tuple[int, int, int]

Border color (RGB).

(255, 255, 255)
face_padding float

Extra space around face in each cell.

0.2
smoothing float

Position smoothing factor.

0.8
detection_interval int

Frames between face detections.

3
audio_source Literal['main', 'loudest', 'mix']

Audio handling ("main" uses first source).

'main'
backend Literal['cpu', 'gpu', 'auto']

Detection backend - "cpu", "gpu", or "auto".

'cpu'
sample_rate int

For GPU backend, detect every Nth frame and interpolate.

1
Source code in src/videopython/ai/transforms.py
def __init__(
    self,
    layout: Literal["2x1", "1x2", "2x2", "1+2", "2+1"] = "2x1",
    output_size: tuple[int, int] | None = None,
    gap: int = 4,
    gap_color: tuple[int, int, int] = (0, 0, 0),
    border_width: int = 0,
    border_color: tuple[int, int, int] = (255, 255, 255),
    face_padding: float = 0.2,
    smoothing: float = 0.8,
    detection_interval: int = 3,
    audio_source: Literal["main", "loudest", "mix"] = "main",
    backend: Literal["cpu", "gpu", "auto"] = "cpu",
    sample_rate: int = 1,
):
    """Initialize split screen composite.

    Args:
        layout: Grid layout for cells.
            - "2x1": Two cells side by side (horizontal)
            - "1x2": Two cells stacked (vertical)
            - "2x2": Four cells in 2x2 grid
            - "1+2": One large cell on left, two small on right
            - "2+1": Two small cells on left, one large on right
        output_size: Output dimensions (width, height). If None, use source size.
        gap: Gap between cells in pixels.
        gap_color: Color of gap between cells (RGB).
        border_width: Border width around each cell.
        border_color: Border color (RGB).
        face_padding: Extra space around face in each cell.
        smoothing: Position smoothing factor.
        detection_interval: Frames between face detections.
        audio_source: Audio handling ("main" uses first source).
        backend: Detection backend - "cpu", "gpu", or "auto".
        sample_rate: For GPU backend, detect every Nth frame and interpolate.
    """
    self.layout = layout
    self.output_size = output_size
    self.gap = gap
    self.gap_color = gap_color
    self.border_width = border_width
    self.border_color = border_color
    self.face_padding = face_padding
    self.smoothing = smoothing
    self.detection_interval = detection_interval
    self.audio_source = audio_source
    self.backend: Literal["cpu", "gpu", "auto"] = backend
    self.sample_rate = sample_rate

apply

apply(video: Video, *additional_videos: Video) -> Video

Apply split screen composite to videos.

Parameters:

Name Type Description Default
video Video

Primary video (used for timing and audio).

required
*additional_videos Video

Additional videos to include in grid.

()

Returns:

Type Description
Video

Composite video with all sources in grid layout.

Source code in src/videopython/ai/transforms.py
def _cell_crop_size(self, src_w: int, src_h: int, cell_w: int, cell_h: int) -> tuple[int, int]:
    """Return the largest even-sized (crop_w, crop_h) of a source frame
    that matches the cell's aspect ratio.

    Args:
        src_w: Source frame width in pixels.
        src_h: Source frame height in pixels.
        cell_w: Target cell width in pixels.
        cell_h: Target cell height in pixels.

    Returns:
        Tuple of (crop_w, crop_h), both made even for encoder friendliness.
    """
    cell_aspect = cell_w / cell_h
    src_aspect = src_w / src_h
    if cell_aspect < src_aspect:
        # Cell is taller than the source - keep full height, crop width.
        crop_h = _make_even(src_h)
        crop_w = _make_even(int(crop_h * cell_aspect))
    else:
        # Cell is wider than the source - keep full width, crop height.
        crop_w = _make_even(src_w)
        crop_h = _make_even(int(crop_w / cell_aspect))
    return crop_w, crop_h

def apply(self, video: Video, *additional_videos: Video) -> Video:
    """Apply split screen composite to videos.

    Note: the primary ``video`` is modified in place (its frames are
    replaced with the composite) and then returned.

    Args:
        video: Primary video (used for timing and audio).
        *additional_videos: Additional videos to include in grid.

    Returns:
        Composite video with all sources in grid layout.

    Raises:
        ValueError: If fewer videos are supplied than the layout requires.
    """
    all_videos = [video] + list(additional_videos)
    required = self._get_required_sources()

    if len(all_videos) < required:
        raise ValueError(f"Layout '{self.layout}' requires {required} videos, got {len(all_videos)}")

    # The first video drives the composite's frame count; shorter
    # additional sources are looped below.
    n_frames = len(video.frames)

    # Determine output size (default: primary video's dimensions).
    if self.output_size:
        out_w, out_h = self.output_size
    else:
        out_w, out_h = video.frame_shape[1], video.frame_shape[0]

    # Keep final composite dimensions encoder-friendly by default.
    out_w = _make_even(out_w)
    out_h = _make_even(out_h)

    cell_rects = self._get_cell_rects(out_w, out_h)

    # One tracker per cell so each source's face position is smoothed
    # independently.
    trackers = [
        FaceTracker(
            selection_strategy="largest",
            smoothing=self.smoothing,
            detection_interval=self.detection_interval,
            backend=self.backend,
            sample_rate=self.sample_rate,
        )
        for _ in range(len(cell_rects))
    ]

    # Crop dimensions depend only on the source frame size and the cell
    # aspect ratio, both constant across frames - compute them once
    # instead of once per frame per cell.
    crop_sizes = []
    for cell_idx, (_, _, cw, ch) in enumerate(cell_rects):
        if cell_idx >= len(all_videos):
            # Cell has no source video; the frame loop breaks before use.
            crop_sizes.append(None)
        else:
            src_h, src_w = all_videos[cell_idx].frames[0].shape[:2]
            crop_sizes.append(self._cell_crop_size(src_w, src_h, cw, ch))

    print(f"Creating {self.layout} split screen: {out_w}x{out_h}")

    new_frames = []
    for i in tqdm(range(n_frames), desc="Split screen composite"):
        # Start each composite frame from a gap-colored canvas.
        output = np.full((out_h, out_w, 3), self.gap_color, dtype=np.uint8)

        for cell_idx, (cx, cy, cw, ch) in enumerate(cell_rects):
            if cell_idx >= len(all_videos):
                break

            src_video = all_videos[cell_idx]
            src_idx = i % len(src_video.frames)  # loop shorter sources
            src_frame = src_video.frames[src_idx]
            src_h, src_w = src_frame.shape[:2]
            crop_w, crop_h = crop_sizes[cell_idx]

            # Track face in source frame.
            face_info = trackers[cell_idx].detect_and_track(src_frame, i)

            if face_info:
                # Center the crop on the tracked face, clamped so the
                # crop window stays inside the frame.
                # NOTE(review): self.face_padding is accepted by __init__
                # but not applied here - confirm whether it should widen
                # the crop around the face.
                fcx, fcy, _, _ = face_info
                center_x = int(fcx * src_w)
                center_y = int(fcy * src_h)
                crop_x = max(0, min(center_x - crop_w // 2, src_w - crop_w))
                crop_y = max(0, min(center_y - crop_h // 2, src_h - crop_h))
            else:
                # No face found: fall back to a centered crop.
                crop_x = max(0, (src_w - crop_w) // 2)
                crop_y = max(0, (src_h - crop_h) // 2)

            cropped = src_frame[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w]

            # Resize crop to exactly fill the cell.
            resized = cv2.resize(cropped, (cw, ch), interpolation=cv2.INTER_AREA)

            # Paint an inset border on all four edges if requested.
            if self.border_width > 0:
                bw = self.border_width
                resized[:bw, :] = self.border_color
                resized[-bw:, :] = self.border_color
                resized[:, :bw] = self.border_color
                resized[:, -bw:] = self.border_color

            # Place the finished cell into the composite canvas.
            output[cy : cy + ch, cx : cx + cw] = resized

        new_frames.append(output)

    video.frames = np.array(new_frames, dtype=np.uint8)

    # Audio handling - keep main video audio
    # (mixing multiple audio tracks would require more complex handling)
    return video