Skip to content

AI Object Swapping

Replace, remove, or modify objects in videos using AI-powered segmentation and inpainting.

Local Pipeline

Object swapping uses local SAM2 + GroundingDINO segmentation with SDXL inpainting/compositing.

ObjectSwapper

Main class for object manipulation in videos.

Swap Object with Generated Content

Replace an object with AI-generated content from a text prompt:

from videopython.base import Video
from videopython.ai import ObjectSwapper

video = Video.from_path("street.mp4")
swapper = ObjectSwapper()

# Replace red car with a blue motorcycle
result = swapper.swap(
    video=video,
    source_object="red car",
    target_object="blue motorcycle",
)

# Create video from swapped frames
swapped_video = Video.from_frames(result.swapped_frames, video.fps)
swapped_video.save("swapped.mp4")

Swap Object with Image

Replace an object with a provided image:

result = swapper.swap_with_image(
    video=video,
    source_object="red car",
    replacement_image="motorcycle.png",
)

Remove Object

Remove an object and fill with background:

result = swapper.remove_object(
    video=video,
    object_prompt="red car",
)

Segment Only

Get object masks without modifying the video:

track = swapper.segment_only(
    video=video,
    object_prompt="person",
)

print(f"Tracked {len(track.masks)} frames")
for mask in track.masks:
    print(f"Frame {mask.frame_index}: confidence {mask.confidence:.2f}")

Visualize Tracking

Debug visualization of tracked object:

debug_frames = swapper.visualize_track(video.frames, track)
debug_video = Video.from_frames(debug_frames, video.fps)
debug_video.save("debug_tracking.mp4")

Progress Tracking

def on_progress(stage: str, progress: float) -> None:
    print(f"[{progress*100:5.1f}%] {stage}")

result = swapper.swap(
    video=video,
    source_object="red car",
    target_object="blue motorcycle",
    progress_callback=on_progress,
)

ObjectSwapper

Swaps objects in videos using segmentation, inpainting, and compositing.

The object swapping pipeline: 1. Segment source object using SAM2 (track across frames) 2. Inpaint background where object was removed 3. Composite replacement (generated or provided image) into cleaned background

Example

from videopython.base.video import Video
from videopython.ai.swapping import ObjectSwapper

video = Video.from_path("street.mp4")
swapper = ObjectSwapper()

Option A: Generate replacement from prompt

result = swapper.swap(video, source_object="red car", target_object="blue motorcycle")

Option B: Use provided image

result = swapper.swap_with_image(
    video, source_object="red car", replacement_image="bike.png"
)

Get result

swapped_video = Video.from_frames(result.swapped_frames, video.fps)

Source code in src/videopython/ai/swapping/swapper.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
class ObjectSwapper:
    """Swaps objects in videos using segmentation, inpainting, and compositing.

    The object swapping pipeline:
    1. Segment source object using SAM2 (track across frames)
    2. Inpaint background where object was removed
    3. Composite replacement (generated or provided image) into cleaned background

    Example:
        >>> from videopython.base.video import Video
        >>> from videopython.ai.swapping import ObjectSwapper
        >>>
        >>> video = Video.from_path("street.mp4")
        >>> swapper = ObjectSwapper()
        >>>
        >>> # Option A: Generate replacement from prompt
        >>> result = swapper.swap(video, source_object="red car", target_object="blue motorcycle")
        >>>
        >>> # Option B: Use provided image
        >>> result = swapper.swap_with_image(
        ...     video, source_object="red car", replacement_image="bike.png"
        ... )
        >>>
        >>> # Get result
        >>> swapped_video = Video.from_frames(result.swapped_frames, video.fps)
    """

    def __init__(
        self,
        config: SwapConfig | None = None,
        device: str | None = None,
    ):
        """Initialize the object swapper.

        Args:
            config: Configuration for the swapping pipeline. Defaults to a
                fresh ``SwapConfig`` when omitted.
            device: Device for local models ('cuda', 'mps', or 'cpu').
        """
        self.config = config or SwapConfig()
        self.device = device

        # Heavy models are expensive to load, so components are created
        # lazily on first use rather than at construction time.
        self._segmenter: ObjectSegmenter | None = None
        self._inpainter: VideoInpainter | None = None
        self._image_generator: Any = None

    def _get_segmenter(self) -> ObjectSegmenter:
        """Get or create the object segmenter (lazy, cached on the instance)."""
        if self._segmenter is None:
            self._segmenter = ObjectSegmenter(
                config=self.config.segmentation,
                device=self.device,
            )
        return self._segmenter

    def _get_inpainter(self) -> VideoInpainter:
        """Get or create the video inpainter (lazy, cached on the instance)."""
        if self._inpainter is None:
            self._inpainter = VideoInpainter(
                config=self.config.inpainting,
                device=self.device,
            )
        return self._inpainter

    def _get_image_generator(self) -> Any:
        """Get or create the image generator for target object generation."""
        if self._image_generator is None:
            # Imported lazily so the text-to-image stack is only loaded when
            # prompt-based replacement is actually requested.
            from videopython.ai.generation import TextToImage

            self._image_generator = TextToImage()
        return self._image_generator

    def _generate_replacement_image(
        self,
        target_prompt: str,
        width: int,
        height: int,
    ) -> np.ndarray:
        """Generate a replacement image from a text prompt.

        Args:
            target_prompt: Text description of replacement object.
            width: Target width in pixels.
            height: Target height in pixels.

        Returns:
            Generated image as RGB array of shape (height, width, 3).
        """
        generator = self._get_image_generator()
        from PIL import Image

        image = generator.generate_image(target_prompt)
        # LANCZOS resampling keeps the generated image sharp when rescaled
        # to the video's resolution.
        image = image.resize((width, height), Image.Resampling.LANCZOS)

        return np.array(image)

    def _load_replacement_image(self, image_path: str | Path) -> np.ndarray:
        """Load a replacement image from file.

        Args:
            image_path: Path to the replacement image.

        Returns:
            Image as RGB array.
        """
        from PIL import Image

        # Force RGB so palettes/alpha channels don't leak into compositing.
        image = Image.open(image_path).convert("RGB")
        return np.array(image)

    def _composite_replacement(
        self,
        background: np.ndarray,
        replacement: np.ndarray,
        mask: np.ndarray,
        blend_factor: float = 0.5,
    ) -> np.ndarray:
        """Composite replacement image onto background using mask.

        Args:
            background: Background frame of shape (H, W, C).
            replacement: Replacement image of shape (H, W, C).
            mask: Binary mask of shape (H, W) indicating replacement region.
            blend_factor: Edge blending factor (0=hard, 1=soft).

        Returns:
            Composited frame of shape (H, W, C).
        """
        import cv2

        # Resize replacement to match background dimensions.
        h, w = background.shape[:2]
        replacement_resized = cv2.resize(replacement, (w, h), interpolation=cv2.INTER_LINEAR)

        if blend_factor > 0:
            # Feather the mask edges; the kernel size scales with frame size
            # and must be odd for GaussianBlur.
            blur_size = max(3, int(min(h, w) * blend_factor * 0.05))
            if blur_size % 2 == 0:
                blur_size += 1
            soft_mask = cv2.GaussianBlur(mask.astype(np.float32), (blur_size, blur_size), 0)  # type: ignore[type-var]
        else:
            soft_mask = mask.astype(np.float32)

        # Expand mask to 3 channels for per-channel alpha blending.
        soft_mask_3d = soft_mask[:, :, np.newaxis]

        # Composite: result = bg * (1 - mask) + replacement * mask
        result = background * (1 - soft_mask_3d) + replacement_resized * soft_mask_3d

        return result.astype(np.uint8)

    def _composite_video(
        self,
        inpainted_frames: np.ndarray,
        replacement: np.ndarray,
        track: Any,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> np.ndarray:
        """Composite replacement onto all video frames.

        Args:
            inpainted_frames: Background frames with object removed, shape (N, H, W, C).
            replacement: Replacement image.
            track: Object track with per-frame masks.
            progress_callback: Optional progress callback.

        Returns:
            Composited video frames of shape (N, H, W, C).
        """
        num_frames = inpainted_frames.shape[0]
        if num_frames == 0:
            # Guard against zero-frame input: np.stack raises on an empty
            # sequence, so return the (empty) array unchanged.
            return inpainted_frames

        composited = []
        for i in range(num_frames):
            if progress_callback:
                progress_callback("Compositing frames", i / num_frames)

            mask_obj = track.get_mask_for_frame(i)
            if mask_obj is None or mask_obj.area == 0:
                # Object not visible in this frame: keep inpainted background.
                composited.append(inpainted_frames[i])
                continue

            composited.append(
                self._composite_replacement(
                    background=inpainted_frames[i],
                    replacement=replacement,
                    mask=mask_obj.mask,
                    blend_factor=self.config.composite_blend,
                )
            )

        if progress_callback:
            progress_callback("Compositing complete", 1.0)

        return np.stack(composited, axis=0)

    def _segment_and_inpaint(
        self,
        frames: np.ndarray,
        source_object: str,
        ref_frame: int,
        inpaint_prompt: str,
        progress_callback: Callable[[str, float], None] | None,
        seg_end: float,
        inpaint_end: float,
        segment_stage: str = "Segmenting source object",
        inpaint_stage: str = "Inpainting background",
    ) -> tuple[Any, np.ndarray]:
        """Run the shared segment-then-inpaint stages of the pipeline.

        Shared by :meth:`swap`, :meth:`swap_with_image` and
        :meth:`remove_object`, which only differ in progress weighting,
        stage names, and inpaint prompt.

        Args:
            frames: Video frames of shape (N, H, W, C).
            source_object: Text description of the object to segment.
            ref_frame: Frame index for initial segmentation.
            inpaint_prompt: Prompt guiding background generation.
            progress_callback: Optional progress callback.
            seg_end: Overall progress fraction where segmentation ends.
            inpaint_end: Overall progress fraction where inpainting ends.
            segment_stage: Stage name reported while segmenting.
            inpaint_stage: Stage name reported while inpainting.

        Returns:
            Tuple of (object track, inpainted frames).
        """
        if progress_callback:
            progress_callback(segment_stage, 0.0)

        segmenter = self._get_segmenter()
        track = segmenter.segment_object(
            frames=frames,
            prompt=source_object,
            reference_frame=ref_frame,
            # Map sub-stage progress [0, 1] onto overall [0, seg_end].
            progress_callback=lambda msg, p: progress_callback(msg, p * seg_end) if progress_callback else None,
        )

        if progress_callback:
            progress_callback(inpaint_stage, seg_end)

        inpainter = self._get_inpainter()
        inpainted_frames = inpainter.inpaint(
            frames=frames,
            track=track,
            prompt=inpaint_prompt,
            # Map sub-stage progress [0, 1] onto overall [seg_end, inpaint_end].
            progress_callback=lambda msg, p: progress_callback(msg, seg_end + p * (inpaint_end - seg_end))
            if progress_callback
            else None,
        )

        return track, inpainted_frames

    def swap(
        self,
        video: Video,
        source_object: str,
        target_object: str,
        reference_frame: int | None = None,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Swap an object in video with a generated replacement.

        Segments the source object, removes it via inpainting, and composites
        a generated replacement image based on the target prompt.

        Args:
            video: Input video to process.
            source_object: Text description of object to replace (e.g., "red car").
            target_object: Text description of replacement object (e.g., "blue motorcycle").
            reference_frame: Frame index for initial segmentation. Default: config value.
            progress_callback: Optional callback for progress updates.
                Called with (stage_name, progress_fraction).

        Returns:
            SwapResult containing swapped frames and metadata.

        Example:
            >>> result = swapper.swap(video, "person", "robot")
            >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
        """
        # Explicit argument wins over the configured default reference frame.
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        # Stages 1-2: segment (0-30%) then inpaint the object away (30-60%).
        track, inpainted_frames = self._segment_and_inpaint(
            frames=frames,
            source_object=source_object,
            ref_frame=ref_frame,
            inpaint_prompt="background, seamless, natural",
            progress_callback=progress_callback,
            seg_end=0.3,
            inpaint_end=0.6,
        )

        # Stage 3: Generate replacement image at the video's own resolution.
        if progress_callback:
            progress_callback("Generating replacement object", 0.6)

        h, w = frames.shape[1:3]
        replacement = self._generate_replacement_image(
            target_prompt=target_object,
            width=w,
            height=h,
        )

        # Stage 4: Composite replacement (70-100%).
        if progress_callback:
            progress_callback("Compositing frames", 0.7)

        swapped_frames = self._composite_video(
            inpainted_frames=inpainted_frames,
            replacement=replacement,
            track=track,
            progress_callback=lambda msg, p: progress_callback(msg, 0.7 + p * 0.3) if progress_callback else None,
        )

        if progress_callback:
            progress_callback("Complete", 1.0)

        return SwapResult(
            swapped_frames=swapped_frames,
            object_track=track,
            inpainted_frames=inpainted_frames,
            source_prompt=source_object,
            target_prompt=target_object,
        )

    def swap_with_image(
        self,
        video: Video,
        source_object: str,
        replacement_image: str | Path,
        reference_frame: int | None = None,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Swap an object in video with a provided replacement image.

        Segments the source object, removes it via inpainting, and composites
        the provided replacement image in its place.

        Args:
            video: Input video to process.
            source_object: Text description of object to replace (e.g., "red car").
            replacement_image: Path to replacement image file.
            reference_frame: Frame index for initial segmentation. Default: config value.
            progress_callback: Optional callback for progress updates.

        Returns:
            SwapResult containing swapped frames and metadata.

        Example:
            >>> result = swapper.swap_with_image(video, "logo", "new_logo.png")
            >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
        """
        # Explicit argument wins over the configured default reference frame.
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        # Stages 1-2: segment (0-30%) then inpaint the object away (30-70%).
        track, inpainted_frames = self._segment_and_inpaint(
            frames=frames,
            source_object=source_object,
            ref_frame=ref_frame,
            inpaint_prompt="background, seamless, natural",
            progress_callback=progress_callback,
            seg_end=0.3,
            inpaint_end=0.7,
        )

        # Stage 3: Load replacement image from disk.
        if progress_callback:
            progress_callback("Loading replacement image", 0.7)

        replacement = self._load_replacement_image(replacement_image)

        # Stage 4: Composite replacement (75-100%).
        if progress_callback:
            progress_callback("Compositing frames", 0.75)

        swapped_frames = self._composite_video(
            inpainted_frames=inpainted_frames,
            replacement=replacement,
            track=track,
            progress_callback=lambda msg, p: progress_callback(msg, 0.75 + p * 0.25) if progress_callback else None,
        )

        if progress_callback:
            progress_callback("Complete", 1.0)

        return SwapResult(
            swapped_frames=swapped_frames,
            object_track=track,
            inpainted_frames=inpainted_frames,
            source_prompt=source_object,
            replacement_image=str(replacement_image),
        )

    def remove_object(
        self,
        video: Video,
        source_object: str,
        reference_frame: int | None = None,
        inpaint_prompt: str = "background, seamless, natural",
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Remove an object from video without replacement.

        Segments the object and inpaints the background to remove it cleanly.

        Args:
            video: Input video to process.
            source_object: Text description of object to remove.
            reference_frame: Frame index for initial segmentation.
            inpaint_prompt: Prompt to guide background generation.
            progress_callback: Optional progress callback.

        Returns:
            SwapResult with inpainted frames (swapped_frames equals inpainted_frames).

        Example:
            >>> result = swapper.remove_object(video, "watermark")
            >>> Video.from_frames(result.swapped_frames, video.fps).save("clean.mp4")
        """
        # Explicit argument wins over the configured default reference frame.
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        # Stages 1-2: segment (0-40%) then inpaint to remove (40-100%).
        # No compositing stage: removal ends at the inpainted background.
        track, inpainted_frames = self._segment_and_inpaint(
            frames=frames,
            source_object=source_object,
            ref_frame=ref_frame,
            inpaint_prompt=inpaint_prompt,
            progress_callback=progress_callback,
            seg_end=0.4,
            inpaint_end=1.0,
            segment_stage="Segmenting object to remove",
            inpaint_stage="Removing object",
        )

        if progress_callback:
            progress_callback("Complete", 1.0)

        return SwapResult(
            swapped_frames=inpainted_frames,
            object_track=track,
            inpainted_frames=inpainted_frames,
            source_prompt=source_object,
        )

    def segment_only(
        self,
        video: Video,
        source_object: str,
        reference_frame: int | None = None,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Segment an object without swapping or inpainting.

        Useful for previewing segmentation results before full processing.

        Args:
            video: Input video to process.
            source_object: Text description of object to segment.
            reference_frame: Frame index for initial segmentation.
            progress_callback: Optional progress callback.

        Returns:
            SwapResult with original frames and object track (no swapping performed).
        """
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        segmenter = self._get_segmenter()
        track = segmenter.segment_object(
            frames=frames,
            prompt=source_object,
            reference_frame=ref_frame,
            progress_callback=progress_callback,
        )

        # Copy so callers can mutate the result without touching the source video.
        return SwapResult(
            swapped_frames=frames.copy(),
            object_track=track,
            source_prompt=source_object,
        )

    @staticmethod
    def visualize_track(
        frames: np.ndarray,
        track: Any,
        color: tuple[int, int, int] = (255, 0, 0),
        alpha: float = 0.5,
    ) -> np.ndarray:
        """Overlay object masks on video frames for visualization.

        Args:
            frames: Video frames array of shape (N, H, W, C).
            track: ObjectTrack to visualize.
            color: RGB color for mask overlay.
            alpha: Opacity of mask overlay (0-1).

        Returns:
            Frames with mask overlay.
        """
        visualized = frames.copy()
        overlay_color = np.array(color, dtype=np.float32)

        for i in range(frames.shape[0]):
            mask_obj = track.get_mask_for_frame(i)
            if mask_obj is None or mask_obj.area == 0:
                continue

            mask = mask_obj.mask
            mask_3d = mask[:, :, np.newaxis]

            # Blend original with colored overlay in masked region only.
            original = visualized[i].astype(np.float32)
            colored = original * (1 - alpha) + overlay_color * alpha
            visualized[i] = np.where(mask_3d, colored, original).astype(np.uint8)

        return visualized
__init__

__init__(
    config: SwapConfig | None = None,
    device: str | None = None,
)

Initialize the object swapper.

Parameters:

Name Type Description Default
config SwapConfig | None

Configuration for the swapping pipeline.

None
device str | None

Device for local models ('cuda', 'mps', or 'cpu').

None
Source code in src/videopython/ai/swapping/swapper.py
def __init__(
    self,
    config: SwapConfig | None = None,
    device: str | None = None,
):
    """Initialize the object swapper.

    Args:
        config: Configuration for the swapping pipeline.
        device: Device for local models ('cuda', 'mps', or 'cpu').
    """
    # Use the default pipeline configuration when none is supplied.
    self.config = config or SwapConfig()
    # NOTE(review): device=None presumably lets the local models pick their
    # own default device — confirm against ObjectSegmenter/VideoInpainter.
    self.device = device

    # Lazy-loaded components
    self._segmenter: ObjectSegmenter | None = None
    self._inpainter: VideoInpainter | None = None
    self._image_generator: Any = None

swap

swap(
    video: Video,
    source_object: str,
    target_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Swap an object in video with a generated replacement.

Segments the source object, removes it via inpainting, and composites a generated replacement image based on the target prompt.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to replace (e.g., "red car").

required
target_object str

Text description of replacement object (e.g., "blue motorcycle").

required
reference_frame int | None

Frame index for initial segmentation. Default: config value.

None
progress_callback Callable[[str, float], None] | None

Optional callback for progress updates. Called with (stage_name, progress_fraction).

None

Returns:

Type Description
SwapResult

SwapResult containing swapped frames and metadata.

Example

result = swapper.swap(video, "person", "robot")
Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")

Source code in src/videopython/ai/swapping/swapper.py
def swap(
    self,
    video: Video,
    source_object: str,
    target_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Swap an object in video with a generated replacement.

    Segments the source object, removes it via inpainting, and composites
    a generated replacement image based on the target prompt.

    Args:
        video: Input video to process.
        source_object: Text description of object to replace (e.g., "red car").
        target_object: Text description of replacement object (e.g., "blue motorcycle").
        reference_frame: Frame index for initial segmentation. Default: config value.
        progress_callback: Optional callback for progress updates.
            Called with (stage_name, progress_fraction).

    Returns:
        SwapResult containing swapped frames and metadata.

    Example:
        >>> result = swapper.swap(video, "person", "robot")
        >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
    """
    # Explicit argument wins over the configured default reference frame.
    ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
    frames = video.frames

    # Stage 1: Segment source object (overall progress 0-30%)
    if progress_callback:
        progress_callback("Segmenting source object", 0.0)

    segmenter = self._get_segmenter()
    track = segmenter.segment_object(
        frames=frames,
        prompt=source_object,
        reference_frame=ref_frame,
        progress_callback=lambda msg, p: progress_callback(msg, p * 0.3) if progress_callback else None,
    )

    # Stage 2: Inpaint background (overall progress 30-60%)
    if progress_callback:
        progress_callback("Inpainting background", 0.3)

    inpainter = self._get_inpainter()
    inpainted_frames = inpainter.inpaint(
        frames=frames,
        track=track,
        prompt="background, seamless, natural",
        progress_callback=lambda msg, p: progress_callback(msg, 0.3 + p * 0.3) if progress_callback else None,
    )

    # Stage 3: Generate replacement image (reported at 60%)
    if progress_callback:
        progress_callback("Generating replacement object", 0.6)

    # Generate the replacement at the video's own resolution.
    h, w = frames.shape[1:3]
    replacement = self._generate_replacement_image(
        target_prompt=target_object,
        width=w,
        height=h,
    )

    # Stage 4: Composite replacement (overall progress 70-100%)
    if progress_callback:
        progress_callback("Compositing frames", 0.7)

    swapped_frames = self._composite_video(
        inpainted_frames=inpainted_frames,
        replacement=replacement,
        track=track,
        progress_callback=lambda msg, p: progress_callback(msg, 0.7 + p * 0.3) if progress_callback else None,
    )

    if progress_callback:
        progress_callback("Complete", 1.0)

    return SwapResult(
        swapped_frames=swapped_frames,
        object_track=track,
        inpainted_frames=inpainted_frames,
        source_prompt=source_object,
        target_prompt=target_object,
    )

swap_with_image

swap_with_image(
    video: Video,
    source_object: str,
    replacement_image: str | Path,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Swap an object in video with a provided replacement image.

Segments the source object, removes it via inpainting, and composites the provided replacement image in its place.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to replace (e.g., "red car").

required
replacement_image str | Path

Path to replacement image file.

required
reference_frame int | None

Frame index for initial segmentation. Default: config value.

None
progress_callback Callable[[str, float], None] | None

Optional callback for progress updates.

None

Returns:

Type Description
SwapResult

SwapResult containing swapped frames and metadata.

Example

result = swapper.swap_with_image(video, "logo", "new_logo.png")
Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")

Source code in src/videopython/ai/swapping/swapper.py
def swap_with_image(
    self,
    video: Video,
    source_object: str,
    replacement_image: str | Path,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Swap an object in video with a provided replacement image.

    Segments the source object, removes it via inpainting, and composites
    the provided replacement image in its place.

    Args:
        video: Input video to process.
        source_object: Text description of object to replace (e.g., "red car").
        replacement_image: Path to replacement image file.
        reference_frame: Frame index for initial segmentation. Default: config value.
        progress_callback: Optional callback for progress updates.

    Returns:
        SwapResult containing swapped frames and metadata.

    Example:
        >>> result = swapper.swap_with_image(video, "logo", "new_logo.png")
        >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
    """
    # Explicit argument wins over the configured default reference frame.
    ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
    frames = video.frames

    # Stage 1: Segment source object (overall progress 0-30%)
    if progress_callback:
        progress_callback("Segmenting source object", 0.0)

    segmenter = self._get_segmenter()
    track = segmenter.segment_object(
        frames=frames,
        prompt=source_object,
        reference_frame=ref_frame,
        progress_callback=lambda msg, p: progress_callback(msg, p * 0.3) if progress_callback else None,
    )

    # Stage 2: Inpaint background (overall progress 30-70%)
    if progress_callback:
        progress_callback("Inpainting background", 0.3)

    inpainter = self._get_inpainter()
    inpainted_frames = inpainter.inpaint(
        frames=frames,
        track=track,
        prompt="background, seamless, natural",
        progress_callback=lambda msg, p: progress_callback(msg, 0.3 + p * 0.4) if progress_callback else None,
    )

    # Stage 3: Load replacement image (reported at 70%)
    if progress_callback:
        progress_callback("Loading replacement image", 0.7)

    replacement = self._load_replacement_image(replacement_image)

    # Stage 4: Composite replacement (overall progress 75-100%)
    if progress_callback:
        progress_callback("Compositing frames", 0.75)

    swapped_frames = self._composite_video(
        inpainted_frames=inpainted_frames,
        replacement=replacement,
        track=track,
        progress_callback=lambda msg, p: progress_callback(msg, 0.75 + p * 0.25) if progress_callback else None,
    )

    if progress_callback:
        progress_callback("Complete", 1.0)

    return SwapResult(
        swapped_frames=swapped_frames,
        object_track=track,
        inpainted_frames=inpainted_frames,
        source_prompt=source_object,
        replacement_image=str(replacement_image),
    )

remove_object

remove_object(
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    inpaint_prompt: str = "background, seamless, natural",
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Remove an object from video without replacement.

Segments the object and inpaints the background to remove it cleanly.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to remove.

required
reference_frame int | None

Frame index for initial segmentation.

None
inpaint_prompt str

Prompt to guide background generation.

'background, seamless, natural'
progress_callback Callable[[str, float], None] | None

Optional progress callback.

None

Returns:

Type Description
SwapResult

SwapResult with inpainted frames (swapped_frames equals inpainted_frames).

Example

result = swapper.remove_object(video, "watermark")
Video.from_frames(result.swapped_frames, video.fps).save("clean.mp4")

Source code in src/videopython/ai/swapping/swapper.py
def remove_object(
    self,
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    inpaint_prompt: str = "background, seamless, natural",
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Erase an object from a video by inpainting the background over it.

    The object described by ``source_object`` is segmented and tracked across
    all frames, then the masked region is filled with generated background.

    Args:
        video: Input video to process.
        source_object: Text description of object to remove.
        reference_frame: Frame index for initial segmentation.
        inpaint_prompt: Prompt to guide background generation.
        progress_callback: Optional progress callback.

    Returns:
        SwapResult with inpainted frames (swapped_frames equals inpainted_frames).

    Example:
        >>> result = swapper.remove_object(video, "watermark")
        >>> Video.from_frames(result.swapped_frames, video.fps).save("clean.mp4")
    """

    def report(stage: str, fraction: float) -> None:
        # Forward progress only when the caller supplied a callback.
        if progress_callback:
            progress_callback(stage, fraction)

    anchor_frame = self.config.reference_frame if reference_frame is None else reference_frame
    frame_array = video.frames

    # Stage 1 (0% - 40%): locate and track the object across every frame.
    report("Segmenting object to remove", 0.0)
    track = self._get_segmenter().segment_object(
        frames=frame_array,
        prompt=source_object,
        reference_frame=anchor_frame,
        progress_callback=lambda stage, fraction: report(stage, fraction * 0.4),
    )

    # Stage 2 (40% - 100%): fill the tracked region with generated background.
    report("Removing object", 0.4)
    cleaned = self._get_inpainter().inpaint(
        frames=frame_array,
        track=track,
        prompt=inpaint_prompt,
        progress_callback=lambda stage, fraction: report(stage, 0.4 + fraction * 0.6),
    )

    report("Complete", 1.0)

    # Removal has no replacement step, so the swapped output IS the
    # inpainted output.
    return SwapResult(
        swapped_frames=cleaned,
        object_track=track,
        inpainted_frames=cleaned,
        source_prompt=source_object,
    )

segment_only

segment_only(
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Segment an object without swapping or inpainting.

Useful for previewing segmentation results before full processing.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to segment.

required
reference_frame int | None

Frame index for initial segmentation.

None
progress_callback Callable[[str, float], None] | None

Optional progress callback.

None

Returns:

Type Description
SwapResult

SwapResult with original frames and object track (no swapping performed).

Source code in src/videopython/ai/swapping/swapper.py
def segment_only(
    self,
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Run segmentation on its own, leaving the video frames untouched.

    Useful for previewing segmentation results before full processing.

    Args:
        video: Input video to process.
        source_object: Text description of object to segment.
        reference_frame: Frame index for initial segmentation.
        progress_callback: Optional progress callback.

    Returns:
        SwapResult with original frames and object track (no swapping performed).
    """
    anchor_frame = self.config.reference_frame if reference_frame is None else reference_frame
    frame_array = video.frames

    # Only the segmentation stage runs, so the caller's callback is
    # forwarded directly without rescaling.
    track = self._get_segmenter().segment_object(
        frames=frame_array,
        prompt=source_object,
        reference_frame=anchor_frame,
        progress_callback=progress_callback,
    )

    # Copy the frames so callers can mutate the result without touching
    # the source video's array.
    return SwapResult(
        swapped_frames=frame_array.copy(),
        object_track=track,
        source_prompt=source_object,
    )

visualize_track staticmethod

visualize_track(
    frames: ndarray,
    track: Any,
    color: tuple[int, int, int] = (255, 0, 0),
    alpha: float = 0.5,
) -> np.ndarray

Overlay object masks on video frames for visualization.

Parameters:

Name Type Description Default
frames ndarray

Video frames array of shape (N, H, W, C).

required
track Any

ObjectTrack to visualize.

required
color tuple[int, int, int]

RGB color for mask overlay.

(255, 0, 0)
alpha float

Opacity of mask overlay (0-1).

0.5

Returns:

Type Description
ndarray

Frames with mask overlay.

Source code in src/videopython/ai/swapping/swapper.py
@staticmethod
def visualize_track(
    frames: np.ndarray,
    track: Any,
    color: tuple[int, int, int] = (255, 0, 0),
    alpha: float = 0.5,
) -> np.ndarray:
    """Paint the tracked object's masks onto a copy of the video frames.

    Args:
        frames: Video frames array of shape (N, H, W, C).
        track: ObjectTrack to visualize.
        color: RGB color for mask overlay.
        alpha: Opacity of mask overlay (0-1).

    Returns:
        Frames with mask overlay.
    """
    output = frames.copy()
    tint = np.array(color, dtype=np.float32)

    for frame_idx in range(output.shape[0]):
        entry = track.get_mask_for_frame(frame_idx)
        # Skip frames where the object is absent or the mask is empty.
        if entry is None or entry.area == 0:
            continue

        region = entry.mask[..., None]

        # Alpha-blend the tint over the original pixels, but keep the
        # original values outside the mask.
        base = output[frame_idx].astype(np.float32)
        blended = base * (1.0 - alpha) + tint * alpha
        output[frame_idx] = np.where(region, blended, base).astype(np.uint8)

    return output

SwapResult

Result of a swap or remove operation.

result = swapper.swap(video, "car", "truck")

print(f"Processed {len(result.swapped_frames)} frames")
print(f"Object tracked: {result.source_prompt}")
print(f"Track confidence: {result.object_track.masks[0].confidence:.2f}")

SwapResult dataclass

Result of an object swapping operation.

Attributes:

Name Type Description
swapped_frames ndarray

Array of frames with object swapped, shape (N, H, W, C).

object_track ObjectTrack

The tracked object that was swapped.

inpainted_frames ndarray | None

Frames with object removed (background only), shape (N, H, W, C).

source_prompt str

Text prompt used to identify source object.

target_prompt str

Text prompt for the replacement object (if generated).

replacement_image str | None

Path to replacement image (if provided).

Source code in src/videopython/ai/swapping/models.py
@dataclass
class SwapResult:
    """Outcome of an object swap or removal run.

    Attributes:
        swapped_frames: Array of frames with object swapped, shape (N, H, W, C).
        object_track: The tracked object that was swapped.
        inpainted_frames: Frames with object removed (background only), shape (N, H, W, C).
        source_prompt: Text prompt used to identify source object.
        target_prompt: Text prompt for the replacement object (if generated).
        replacement_image: Path to replacement image (if provided).
    """

    swapped_frames: np.ndarray
    object_track: ObjectTrack
    inpainted_frames: np.ndarray | None = None
    source_prompt: str = ""
    target_prompt: str = ""
    replacement_image: str | None = None

    @property
    def num_frames(self) -> int:
        """Number of frames in the result."""
        # len() of an ndarray is its first dimension.
        return len(self.swapped_frames)

    @property
    def frame_size(self) -> tuple[int, int]:
        """Size of frames as (height, width)."""
        shape = self.swapped_frames.shape
        return (shape[1], shape[2])

    @property
    def has_inpainted_frames(self) -> bool:
        """Whether background-only frames were produced."""
        return self.inpainted_frames is not None

num_frames property

num_frames: int

Number of frames in the result.

frame_size property

frame_size: tuple[int, int]

Size of frames as (height, width).

has_inpainted_frames property

has_inpainted_frames: bool

Check if inpainted frames are available.

ObjectTrack

Tracked object across multiple frames.

ObjectTrack dataclass

A tracked object across multiple frames.

Attributes:

Name Type Description
object_id str

Unique identifier for this tracked object.

masks list[ObjectMask]

List of ObjectMask instances for each frame where object appears.

label str

Text label describing the object (e.g., "red car").

start_frame int

First frame index where object appears.

end_frame int

Last frame index where object appears.

Source code in src/videopython/ai/swapping/models.py
@dataclass
class ObjectTrack:
    """One object followed through a sequence of frames.

    Attributes:
        object_id: Unique identifier for this tracked object.
        masks: List of ObjectMask instances for each frame where object appears.
        label: Text label describing the object (e.g., "red car").
        start_frame: First frame index where object appears.
        end_frame: Last frame index where object appears.
    """

    object_id: str
    masks: list[ObjectMask]
    label: str
    start_frame: int
    end_frame: int

    @property
    def num_frames(self) -> int:
        """Number of frames this object appears in."""
        return len(self.masks)

    @property
    def frame_indices(self) -> list[int]:
        """List of frame indices where object appears."""
        return [entry.frame_index for entry in self.masks]

    @property
    def average_confidence(self) -> float:
        """Average confidence across all masks (0.0 for an empty track)."""
        if not self.masks:
            return 0.0
        total = sum(entry.confidence for entry in self.masks)
        return total / len(self.masks)

    def get_mask_for_frame(self, frame_index: int) -> ObjectMask | None:
        """Get the mask for a specific frame.

        Args:
            frame_index: The frame index to look up.

        Returns:
            The ObjectMask for that frame, or None if not present.
        """
        # Linear scan; returns the first mask whose frame index matches.
        return next((entry for entry in self.masks if entry.frame_index == frame_index), None)

    def get_masks_array(self) -> np.ndarray:
        """Get all masks as a stacked numpy array.

        Returns:
            Array of shape (N, H, W) where N is number of frames.

        Raises:
            ValueError: If the track contains no masks.
        """
        collected = [entry.mask for entry in self.masks]
        if not collected:
            raise ValueError("No masks in track")
        return np.stack(collected, axis=0)

num_frames property

num_frames: int

Number of frames this object appears in.

frame_indices property

frame_indices: list[int]

List of frame indices where object appears.

average_confidence property

average_confidence: float

Average confidence across all masks.

get_mask_for_frame

get_mask_for_frame(frame_index: int) -> ObjectMask | None

Get the mask for a specific frame.

Parameters:

Name Type Description Default
frame_index int

The frame index to look up.

required

Returns:

Type Description
ObjectMask | None

The ObjectMask for that frame, or None if not present.

Source code in src/videopython/ai/swapping/models.py
def get_mask_for_frame(self, frame_index: int) -> ObjectMask | None:
    """Get the mask for a specific frame.

    Args:
        frame_index: The frame index to look up.

    Returns:
        The ObjectMask for that frame, or None if not present.
    """
    # Linear scan; returns the first mask whose frame index matches.
    return next((entry for entry in self.masks if entry.frame_index == frame_index), None)

get_masks_array

get_masks_array() -> np.ndarray

Get all masks as a stacked numpy array.

Returns:

Type Description
ndarray

Array of shape (N, H, W) where N is number of frames.

Source code in src/videopython/ai/swapping/models.py
def get_masks_array(self) -> np.ndarray:
    """Get all masks as a stacked numpy array.

    Returns:
        Array of shape (N, H, W) where N is number of frames.

    Raises:
        ValueError: If the track contains no masks.
    """
    collected = [entry.mask for entry in self.masks]
    if not collected:
        raise ValueError("No masks in track")
    return np.stack(collected, axis=0)

ObjectMask

Single-frame object mask with confidence and bounding box.

ObjectMask dataclass

A mask representing an object in a single frame.

Attributes:

Name Type Description
frame_index int

Index of the frame this mask belongs to.

mask ndarray

Binary mask array of shape (H, W) where True indicates object pixels.

confidence float

Confidence score of the segmentation (0.0 to 1.0).

bounding_box tuple[float, float, float, float] | None

Optional bounding box as (x1, y1, x2, y2) normalized coordinates.

Source code in src/videopython/ai/swapping/models.py
@dataclass
class ObjectMask:
    """A mask covering one object in one frame.

    Attributes:
        frame_index: Index of the frame this mask belongs to.
        mask: Binary mask array of shape (H, W) where True indicates object pixels.
        confidence: Confidence score of the segmentation (0.0 to 1.0).
        bounding_box: Optional bounding box as (x1, y1, x2, y2) normalized coordinates.
    """

    frame_index: int
    mask: np.ndarray
    confidence: float
    bounding_box: tuple[float, float, float, float] | None = None

    def __post_init__(self) -> None:
        """Reject masks that are not 2D and confidences outside [0, 1]."""
        ndim = self.mask.ndim
        if ndim != 2:
            raise ValueError(f"Mask must be 2D, got shape {self.mask.shape}")
        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0 and 1, got {self.confidence}")

    @property
    def height(self) -> int:
        """Height of the mask."""
        return self.mask.shape[0]

    @property
    def width(self) -> int:
        """Width of the mask."""
        return self.mask.shape[1]

    @property
    def area(self) -> int:
        """Number of pixels in the mask."""
        return int((self.mask > 0).sum())

    def dilate(self, kernel_size: int = 5) -> ObjectMask:
        """Return a copy of this mask grown by morphological dilation.

        Args:
            kernel_size: Size of the dilation kernel.

        Returns:
            New ObjectMask with dilated mask.
        """
        # cv2 is imported lazily so the module loads without OpenCV installed.
        import cv2

        element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
        grown = cv2.dilate(self.mask.astype(np.uint8), element, iterations=1)
        return ObjectMask(
            frame_index=self.frame_index,
            mask=grown.astype(bool),
            confidence=self.confidence,
            bounding_box=self.bounding_box,
        )

height property

height: int

Height of the mask.

width property

width: int

Width of the mask.

area property

area: int

Number of pixels in the mask.

__post_init__

__post_init__() -> None

Validate mask shape and values.

Source code in src/videopython/ai/swapping/models.py
def __post_init__(self) -> None:
    """Reject masks that are not 2D and confidences outside [0, 1].

    Raises:
        ValueError: If the mask is not 2-dimensional or confidence is out of range.
    """
    ndim = self.mask.ndim
    if ndim != 2:
        raise ValueError(f"Mask must be 2D, got shape {self.mask.shape}")
    # Chained comparison kept so non-orderable values (e.g. NaN) also fail.
    if not 0.0 <= self.confidence <= 1.0:
        raise ValueError(f"Confidence must be between 0 and 1, got {self.confidence}")

dilate

dilate(kernel_size: int = 5) -> ObjectMask

Return a dilated version of this mask.

Parameters:

Name Type Description Default
kernel_size int

Size of the dilation kernel.

5

Returns:

Type Description
ObjectMask

New ObjectMask with dilated mask.

Source code in src/videopython/ai/swapping/models.py
def dilate(self, kernel_size: int = 5) -> ObjectMask:
    """Return a copy of this mask grown by morphological dilation.

    Args:
        kernel_size: Size of the dilation kernel.

    Returns:
        New ObjectMask with dilated mask.
    """
    # cv2 is imported lazily so the module loads without OpenCV installed.
    import cv2

    element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
    grown = cv2.dilate(self.mask.astype(np.uint8), element, iterations=1)
    return ObjectMask(
        frame_index=self.frame_index,
        mask=grown.astype(bool),
        confidence=self.confidence,
        bounding_box=self.bounding_box,
    )