Skip to content

AI Object Swapping

Replace, remove, or modify objects in videos using AI-powered segmentation and inpainting.

Local Pipeline

Object swapping uses local SAM2 + GroundingDINO segmentation with SDXL inpainting/compositing.

ObjectSwapper

Main class for object manipulation in videos.

Swap Object with Generated Content

Replace an object with AI-generated content from a text prompt:

from videopython.base import Video
from videopython.ai import ObjectSwapper

video = Video.from_path("street.mp4")
swapper = ObjectSwapper()

# Replace red car with a blue motorcycle
result = swapper.swap(
    video=video,
    source_object="red car",
    target_object="blue motorcycle",
)

# Create video from swapped frames
swapped_video = Video.from_frames(result.swapped_frames, video.fps)
swapped_video.save("swapped.mp4")

Swap Object with Image

Replace an object with a provided image:

result = swapper.swap_with_image(
    video=video,
    source_object="red car",
    replacement_image="motorcycle.png",
)

Remove Object

Remove an object and fill with background:

result = swapper.remove_object(
    video=video,
    object_prompt="red car",
)

Segment Only

Get object masks without modifying the video:

track = swapper.segment_only(
    video=video,
    object_prompt="person",
)

print(f"Tracked {len(track.masks)} frames")
for mask in track.masks:
    print(f"Frame {mask.frame_index}: confidence {mask.confidence:.2f}")

Visualize Tracking

Debug visualization of tracked object:

debug_frames = swapper.visualize_track(video.frames, track)
debug_video = Video.from_frames(debug_frames, video.fps)
debug_video.save("debug_tracking.mp4")

Progress Tracking

def on_progress(stage: str, progress: float) -> None:
    print(f"[{progress*100:5.1f}%] {stage}")

result = swapper.swap(
    video=video,
    source_object="red car",
    target_object="blue motorcycle",
    progress_callback=on_progress,
)

ObjectSwapper

Swaps objects in videos using segmentation, inpainting, and compositing.

The object swapping pipeline: 1. Segment source object using SAM2 (track across frames) 2. Inpaint background where object was removed 3. Composite replacement (generated or provided image) into cleaned background

Example

from videopython.base.video import Video
from videopython.ai.swapping import ObjectSwapper

video = Video.from_path("street.mp4")
swapper = ObjectSwapper()

Option A: Generate replacement from prompt

result = swapper.swap(video, source_object="red car", target_object="blue motorcycle")

Option B: Use provided image

result = swapper.swap_with_image(
    video, source_object="red car", replacement_image="bike.png"
)

Get result

swapped_video = Video.from_frames(result.swapped_frames, video.fps)

Source code in src/videopython/ai/swapping/swapper.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
class ObjectSwapper:
    """Swaps objects in videos using segmentation, inpainting, and compositing.

    The object swapping pipeline:
    1. Segment source object using SAM2 (track across frames)
    2. Inpaint background where object was removed
    3. Composite replacement (generated or provided image) into cleaned background

    Example:
        >>> from videopython.base.video import Video
        >>> from videopython.ai.swapping import ObjectSwapper
        >>>
        >>> video = Video.from_path("street.mp4")
        >>> swapper = ObjectSwapper()
        >>>
        >>> # Option A: Generate replacement from prompt
        >>> result = swapper.swap(video, source_object="red car", target_object="blue motorcycle")
        >>>
        >>> # Option B: Use provided image
        >>> result = swapper.swap_with_image(
        ...     video, source_object="red car", replacement_image="bike.png"
        ... )
        >>>
        >>> # Get result
        >>> swapped_video = Video.from_frames(result.swapped_frames, video.fps)
    """

    def __init__(
        self,
        config: SwapConfig | None = None,
        device: str | None = None,
    ):
        """Initialize the object swapper.

        Args:
            config: Configuration for the swapping pipeline. Defaults to a
                fresh ``SwapConfig`` when omitted.
            device: Device for local models ('cuda', 'mps', or 'cpu').
        """
        self.config = config or SwapConfig()
        self.device = device

        # Heavy models are expensive to load, so components are created
        # lazily on first use rather than at construction time.
        self._segmenter: ObjectSegmenter | None = None
        self._inpainter: VideoInpainter | None = None
        self._image_generator: Any = None

    def _get_segmenter(self) -> ObjectSegmenter:
        """Get or create the object segmenter (lazy, cached on the instance)."""
        if self._segmenter is None:
            self._segmenter = ObjectSegmenter(
                config=self.config.segmentation,
                device=self.device,
            )
        return self._segmenter

    def _get_inpainter(self) -> VideoInpainter:
        """Get or create the video inpainter (lazy, cached on the instance)."""
        if self._inpainter is None:
            self._inpainter = VideoInpainter(
                config=self.config.inpainting,
                device=self.device,
            )
        return self._inpainter

    def _get_image_generator(self) -> Any:
        """Get or create the image generator for target object generation."""
        if self._image_generator is None:
            # Imported lazily so the text-to-image stack is only loaded when
            # prompt-based replacement is actually requested.
            from videopython.ai.generation import TextToImage

            self._image_generator = TextToImage()
        return self._image_generator

    def _generate_replacement_image(
        self,
        target_prompt: str,
        width: int,
        height: int,
    ) -> np.ndarray:
        """Generate a replacement image from a text prompt.

        Args:
            target_prompt: Text description of replacement object.
            width: Target width in pixels.
            height: Target height in pixels.

        Returns:
            Generated image as RGB array of shape (height, width, 3).
        """
        generator = self._get_image_generator()
        from PIL import Image

        image = generator.generate_image(target_prompt)
        # LANCZOS resampling keeps the generated image sharp when rescaled
        # to the video's resolution.
        image = image.resize((width, height), Image.Resampling.LANCZOS)

        return np.array(image)

    def _load_replacement_image(self, image_path: str | Path) -> np.ndarray:
        """Load a replacement image from file.

        Args:
            image_path: Path to the replacement image.

        Returns:
            Image as RGB array.
        """
        from PIL import Image

        # Force RGB so palettes/alpha channels don't leak into compositing.
        image = Image.open(image_path).convert("RGB")
        return np.array(image)

    def _composite_replacement(
        self,
        background: np.ndarray,
        replacement: np.ndarray,
        mask: np.ndarray,
        blend_factor: float = 0.5,
    ) -> np.ndarray:
        """Composite replacement image onto background using mask.

        Args:
            background: Background frame of shape (H, W, C).
            replacement: Replacement image of shape (H, W, C).
            mask: Binary mask of shape (H, W) indicating replacement region.
            blend_factor: Edge blending factor (0=hard, 1=soft).

        Returns:
            Composited frame of shape (H, W, C).
        """
        import cv2

        # Resize replacement to match background dimensions.
        h, w = background.shape[:2]
        replacement_resized = cv2.resize(replacement, (w, h), interpolation=cv2.INTER_LINEAR)

        if blend_factor > 0:
            # Feather the mask edges; the kernel size scales with frame size
            # and must be odd for GaussianBlur.
            blur_size = max(3, int(min(h, w) * blend_factor * 0.05))
            if blur_size % 2 == 0:
                blur_size += 1
            soft_mask = cv2.GaussianBlur(mask.astype(np.float32), (blur_size, blur_size), 0)  # type: ignore[type-var]
        else:
            soft_mask = mask.astype(np.float32)

        # Expand mask to 3 channels for per-channel alpha blending.
        soft_mask_3d = soft_mask[:, :, np.newaxis]

        # Composite: result = bg * (1 - mask) + replacement * mask
        result = background * (1 - soft_mask_3d) + replacement_resized * soft_mask_3d

        return result.astype(np.uint8)

    def _composite_video(
        self,
        inpainted_frames: np.ndarray,
        replacement: np.ndarray,
        track: Any,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> np.ndarray:
        """Composite replacement onto all video frames.

        Args:
            inpainted_frames: Background frames with object removed, shape (N, H, W, C).
            replacement: Replacement image.
            track: Object track with per-frame masks.
            progress_callback: Optional progress callback.

        Returns:
            Composited video frames of shape (N, H, W, C).
        """
        num_frames = inpainted_frames.shape[0]
        if num_frames == 0:
            # Guard against zero-frame input: np.stack raises on an empty
            # sequence, so return the (empty) array unchanged.
            return inpainted_frames

        composited = []
        for i in range(num_frames):
            if progress_callback:
                progress_callback("Compositing frames", i / num_frames)

            mask_obj = track.get_mask_for_frame(i)
            if mask_obj is None or mask_obj.area == 0:
                # Object not visible in this frame: keep inpainted background.
                composited.append(inpainted_frames[i])
                continue

            composited.append(
                self._composite_replacement(
                    background=inpainted_frames[i],
                    replacement=replacement,
                    mask=mask_obj.mask,
                    blend_factor=self.config.composite_blend,
                )
            )

        if progress_callback:
            progress_callback("Compositing complete", 1.0)

        return np.stack(composited, axis=0)

    def _segment_and_inpaint(
        self,
        frames: np.ndarray,
        source_object: str,
        ref_frame: int,
        inpaint_prompt: str,
        progress_callback: Callable[[str, float], None] | None,
        seg_end: float,
        inpaint_end: float,
        segment_stage: str = "Segmenting source object",
        inpaint_stage: str = "Inpainting background",
    ) -> tuple[Any, np.ndarray]:
        """Run the shared segment-then-inpaint stages of the pipeline.

        Shared by :meth:`swap`, :meth:`swap_with_image` and
        :meth:`remove_object`, which only differ in progress weighting,
        stage names, and inpaint prompt.

        Args:
            frames: Video frames of shape (N, H, W, C).
            source_object: Text description of the object to segment.
            ref_frame: Frame index for initial segmentation.
            inpaint_prompt: Prompt guiding background generation.
            progress_callback: Optional progress callback.
            seg_end: Overall progress fraction where segmentation ends.
            inpaint_end: Overall progress fraction where inpainting ends.
            segment_stage: Stage name reported while segmenting.
            inpaint_stage: Stage name reported while inpainting.

        Returns:
            Tuple of (object track, inpainted frames).
        """
        if progress_callback:
            progress_callback(segment_stage, 0.0)

        segmenter = self._get_segmenter()
        track = segmenter.segment_object(
            frames=frames,
            prompt=source_object,
            reference_frame=ref_frame,
            # Map sub-stage progress [0, 1] onto overall [0, seg_end].
            progress_callback=lambda msg, p: progress_callback(msg, p * seg_end) if progress_callback else None,
        )

        if progress_callback:
            progress_callback(inpaint_stage, seg_end)

        inpainter = self._get_inpainter()
        inpainted_frames = inpainter.inpaint(
            frames=frames,
            track=track,
            prompt=inpaint_prompt,
            # Map sub-stage progress [0, 1] onto overall [seg_end, inpaint_end].
            progress_callback=lambda msg, p: progress_callback(msg, seg_end + p * (inpaint_end - seg_end))
            if progress_callback
            else None,
        )

        return track, inpainted_frames

    def swap(
        self,
        video: Video,
        source_object: str,
        target_object: str,
        reference_frame: int | None = None,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Swap an object in video with a generated replacement.

        Segments the source object, removes it via inpainting, and composites
        a generated replacement image based on the target prompt.

        Args:
            video: Input video to process.
            source_object: Text description of object to replace (e.g., "red car").
            target_object: Text description of replacement object (e.g., "blue motorcycle").
            reference_frame: Frame index for initial segmentation. Default: config value.
            progress_callback: Optional callback for progress updates.
                Called with (stage_name, progress_fraction).

        Returns:
            SwapResult containing swapped frames and metadata.

        Example:
            >>> result = swapper.swap(video, "person", "robot")
            >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
        """
        # Explicit argument wins over the configured default reference frame.
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        # Stages 1-2: segment (0-30%) then inpaint the object away (30-60%).
        track, inpainted_frames = self._segment_and_inpaint(
            frames=frames,
            source_object=source_object,
            ref_frame=ref_frame,
            inpaint_prompt="background, seamless, natural",
            progress_callback=progress_callback,
            seg_end=0.3,
            inpaint_end=0.6,
        )

        # Stage 3: Generate replacement image at the video's own resolution.
        if progress_callback:
            progress_callback("Generating replacement object", 0.6)

        h, w = frames.shape[1:3]
        replacement = self._generate_replacement_image(
            target_prompt=target_object,
            width=w,
            height=h,
        )

        # Stage 4: Composite replacement (70-100%).
        if progress_callback:
            progress_callback("Compositing frames", 0.7)

        swapped_frames = self._composite_video(
            inpainted_frames=inpainted_frames,
            replacement=replacement,
            track=track,
            progress_callback=lambda msg, p: progress_callback(msg, 0.7 + p * 0.3) if progress_callback else None,
        )

        if progress_callback:
            progress_callback("Complete", 1.0)

        return SwapResult(
            swapped_frames=swapped_frames,
            object_track=track,
            inpainted_frames=inpainted_frames,
            source_prompt=source_object,
            target_prompt=target_object,
        )

    def swap_with_image(
        self,
        video: Video,
        source_object: str,
        replacement_image: str | Path,
        reference_frame: int | None = None,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Swap an object in video with a provided replacement image.

        Segments the source object, removes it via inpainting, and composites
        the provided replacement image in its place.

        Args:
            video: Input video to process.
            source_object: Text description of object to replace (e.g., "red car").
            replacement_image: Path to replacement image file.
            reference_frame: Frame index for initial segmentation. Default: config value.
            progress_callback: Optional callback for progress updates.

        Returns:
            SwapResult containing swapped frames and metadata.

        Example:
            >>> result = swapper.swap_with_image(video, "logo", "new_logo.png")
            >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
        """
        # Explicit argument wins over the configured default reference frame.
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        # Stages 1-2: segment (0-30%) then inpaint the object away (30-70%).
        track, inpainted_frames = self._segment_and_inpaint(
            frames=frames,
            source_object=source_object,
            ref_frame=ref_frame,
            inpaint_prompt="background, seamless, natural",
            progress_callback=progress_callback,
            seg_end=0.3,
            inpaint_end=0.7,
        )

        # Stage 3: Load replacement image from disk.
        if progress_callback:
            progress_callback("Loading replacement image", 0.7)

        replacement = self._load_replacement_image(replacement_image)

        # Stage 4: Composite replacement (75-100%).
        if progress_callback:
            progress_callback("Compositing frames", 0.75)

        swapped_frames = self._composite_video(
            inpainted_frames=inpainted_frames,
            replacement=replacement,
            track=track,
            progress_callback=lambda msg, p: progress_callback(msg, 0.75 + p * 0.25) if progress_callback else None,
        )

        if progress_callback:
            progress_callback("Complete", 1.0)

        return SwapResult(
            swapped_frames=swapped_frames,
            object_track=track,
            inpainted_frames=inpainted_frames,
            source_prompt=source_object,
            replacement_image=str(replacement_image),
        )

    def remove_object(
        self,
        video: Video,
        source_object: str,
        reference_frame: int | None = None,
        inpaint_prompt: str = "background, seamless, natural",
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Remove an object from video without replacement.

        Segments the object and inpaints the background to remove it cleanly.

        Args:
            video: Input video to process.
            source_object: Text description of object to remove.
            reference_frame: Frame index for initial segmentation.
            inpaint_prompt: Prompt to guide background generation.
            progress_callback: Optional progress callback.

        Returns:
            SwapResult with inpainted frames (swapped_frames equals inpainted_frames).

        Example:
            >>> result = swapper.remove_object(video, "watermark")
            >>> Video.from_frames(result.swapped_frames, video.fps).save("clean.mp4")
        """
        # Explicit argument wins over the configured default reference frame.
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        # Stages 1-2: segment (0-40%) then inpaint to remove (40-100%).
        # No compositing stage: removal ends at the inpainted background.
        track, inpainted_frames = self._segment_and_inpaint(
            frames=frames,
            source_object=source_object,
            ref_frame=ref_frame,
            inpaint_prompt=inpaint_prompt,
            progress_callback=progress_callback,
            seg_end=0.4,
            inpaint_end=1.0,
            segment_stage="Segmenting object to remove",
            inpaint_stage="Removing object",
        )

        if progress_callback:
            progress_callback("Complete", 1.0)

        return SwapResult(
            swapped_frames=inpainted_frames,
            object_track=track,
            inpainted_frames=inpainted_frames,
            source_prompt=source_object,
        )

    def segment_only(
        self,
        video: Video,
        source_object: str,
        reference_frame: int | None = None,
        progress_callback: Callable[[str, float], None] | None = None,
    ) -> SwapResult:
        """Segment an object without swapping or inpainting.

        Useful for previewing segmentation results before full processing.

        Args:
            video: Input video to process.
            source_object: Text description of object to segment.
            reference_frame: Frame index for initial segmentation.
            progress_callback: Optional progress callback.

        Returns:
            SwapResult with original frames and object track (no swapping performed).
        """
        ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
        frames = video.frames

        segmenter = self._get_segmenter()
        track = segmenter.segment_object(
            frames=frames,
            prompt=source_object,
            reference_frame=ref_frame,
            progress_callback=progress_callback,
        )

        # Copy so callers can mutate the result without touching the source video.
        return SwapResult(
            swapped_frames=frames.copy(),
            object_track=track,
            source_prompt=source_object,
        )

    @staticmethod
    def visualize_track(
        frames: np.ndarray,
        track: Any,
        color: tuple[int, int, int] = (255, 0, 0),
        alpha: float = 0.5,
    ) -> np.ndarray:
        """Overlay object masks on video frames for visualization.

        Args:
            frames: Video frames array of shape (N, H, W, C).
            track: ObjectTrack to visualize.
            color: RGB color for mask overlay.
            alpha: Opacity of mask overlay (0-1).

        Returns:
            Frames with mask overlay.
        """
        visualized = frames.copy()
        overlay_color = np.array(color, dtype=np.float32)

        for i in range(frames.shape[0]):
            mask_obj = track.get_mask_for_frame(i)
            if mask_obj is None or mask_obj.area == 0:
                continue

            mask = mask_obj.mask
            mask_3d = mask[:, :, np.newaxis]

            # Blend original with colored overlay in masked region only.
            original = visualized[i].astype(np.float32)
            colored = original * (1 - alpha) + overlay_color * alpha
            visualized[i] = np.where(mask_3d, colored, original).astype(np.uint8)

        return visualized
__init__

__init__(
    config: SwapConfig | None = None,
    device: str | None = None,
)

Initialize the object swapper.

Parameters:

Name Type Description Default
config SwapConfig | None

Configuration for the swapping pipeline.

None
device str | None

Device for local models ('cuda', 'mps', or 'cpu').

None
Source code in src/videopython/ai/swapping/swapper.py
def __init__(
    self,
    config: SwapConfig | None = None,
    device: str | None = None,
):
    """Initialize the object swapper.

    Args:
        config: Configuration for the swapping pipeline.
        device: Device for local models ('cuda', 'mps', or 'cpu').
    """
    # Use the default pipeline configuration when none is supplied.
    self.config = config or SwapConfig()
    # NOTE(review): device=None presumably lets the local models pick their
    # own default device — confirm against ObjectSegmenter/VideoInpainter.
    self.device = device

    # Lazy-loaded components
    self._segmenter: ObjectSegmenter | None = None
    self._inpainter: VideoInpainter | None = None
    self._image_generator: Any = None

swap

swap(
    video: Video,
    source_object: str,
    target_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Swap an object in video with a generated replacement.

Segments the source object, removes it via inpainting, and composites a generated replacement image based on the target prompt.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to replace (e.g., "red car").

required
target_object str

Text description of replacement object (e.g., "blue motorcycle").

required
reference_frame int | None

Frame index for initial segmentation. Default: config value.

None
progress_callback Callable[[str, float], None] | None

Optional callback for progress updates. Called with (stage_name, progress_fraction).

None

Returns:

Type Description
SwapResult

SwapResult containing swapped frames and metadata.

Example

result = swapper.swap(video, "person", "robot")
Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")

Source code in src/videopython/ai/swapping/swapper.py
def swap(
    self,
    video: Video,
    source_object: str,
    target_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Swap an object in video with a generated replacement.

    Segments the source object, removes it via inpainting, and composites
    a generated replacement image based on the target prompt.

    Args:
        video: Input video to process.
        source_object: Text description of object to replace (e.g., "red car").
        target_object: Text description of replacement object (e.g., "blue motorcycle").
        reference_frame: Frame index for initial segmentation. Default: config value.
        progress_callback: Optional callback for progress updates.
            Called with (stage_name, progress_fraction).

    Returns:
        SwapResult containing swapped frames and metadata.

    Example:
        >>> result = swapper.swap(video, "person", "robot")
        >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
    """
    # Explicit argument wins over the configured default reference frame.
    ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
    frames = video.frames

    # Stage 1: Segment source object (overall progress 0-30%)
    if progress_callback:
        progress_callback("Segmenting source object", 0.0)

    segmenter = self._get_segmenter()
    track = segmenter.segment_object(
        frames=frames,
        prompt=source_object,
        reference_frame=ref_frame,
        progress_callback=lambda msg, p: progress_callback(msg, p * 0.3) if progress_callback else None,
    )

    # Stage 2: Inpaint background (overall progress 30-60%)
    if progress_callback:
        progress_callback("Inpainting background", 0.3)

    inpainter = self._get_inpainter()
    inpainted_frames = inpainter.inpaint(
        frames=frames,
        track=track,
        prompt="background, seamless, natural",
        progress_callback=lambda msg, p: progress_callback(msg, 0.3 + p * 0.3) if progress_callback else None,
    )

    # Stage 3: Generate replacement image (reported at 60%)
    if progress_callback:
        progress_callback("Generating replacement object", 0.6)

    # Generate the replacement at the video's own resolution.
    h, w = frames.shape[1:3]
    replacement = self._generate_replacement_image(
        target_prompt=target_object,
        width=w,
        height=h,
    )

    # Stage 4: Composite replacement (overall progress 70-100%)
    if progress_callback:
        progress_callback("Compositing frames", 0.7)

    swapped_frames = self._composite_video(
        inpainted_frames=inpainted_frames,
        replacement=replacement,
        track=track,
        progress_callback=lambda msg, p: progress_callback(msg, 0.7 + p * 0.3) if progress_callback else None,
    )

    if progress_callback:
        progress_callback("Complete", 1.0)

    return SwapResult(
        swapped_frames=swapped_frames,
        object_track=track,
        inpainted_frames=inpainted_frames,
        source_prompt=source_object,
        target_prompt=target_object,
    )

swap_with_image

swap_with_image(
    video: Video,
    source_object: str,
    replacement_image: str | Path,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Swap an object in video with a provided replacement image.

Segments the source object, removes it via inpainting, and composites the provided replacement image in its place.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to replace (e.g., "red car").

required
replacement_image str | Path

Path to replacement image file.

required
reference_frame int | None

Frame index for initial segmentation. Default: config value.

None
progress_callback Callable[[str, float], None] | None

Optional callback for progress updates.

None

Returns:

Type Description
SwapResult

SwapResult containing swapped frames and metadata.

Example

result = swapper.swap_with_image(video, "logo", "new_logo.png")
Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")

Source code in src/videopython/ai/swapping/swapper.py
def swap_with_image(
    self,
    video: Video,
    source_object: str,
    replacement_image: str | Path,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Swap an object in video with a provided replacement image.

    Segments the source object, removes it via inpainting, and composites
    the provided replacement image in its place.

    Args:
        video: Input video to process.
        source_object: Text description of object to replace (e.g., "red car").
        replacement_image: Path to replacement image file.
        reference_frame: Frame index for initial segmentation. Default: config value.
        progress_callback: Optional callback for progress updates.

    Returns:
        SwapResult containing swapped frames and metadata.

    Example:
        >>> result = swapper.swap_with_image(video, "logo", "new_logo.png")
        >>> Video.from_frames(result.swapped_frames, video.fps).save("output.mp4")
    """
    # Explicit argument wins over the configured default reference frame.
    ref_frame = reference_frame if reference_frame is not None else self.config.reference_frame
    frames = video.frames

    # Stage 1: Segment source object (overall progress 0-30%)
    if progress_callback:
        progress_callback("Segmenting source object", 0.0)

    segmenter = self._get_segmenter()
    track = segmenter.segment_object(
        frames=frames,
        prompt=source_object,
        reference_frame=ref_frame,
        progress_callback=lambda msg, p: progress_callback(msg, p * 0.3) if progress_callback else None,
    )

    # Stage 2: Inpaint background (overall progress 30-70%)
    if progress_callback:
        progress_callback("Inpainting background", 0.3)

    inpainter = self._get_inpainter()
    inpainted_frames = inpainter.inpaint(
        frames=frames,
        track=track,
        prompt="background, seamless, natural",
        progress_callback=lambda msg, p: progress_callback(msg, 0.3 + p * 0.4) if progress_callback else None,
    )

    # Stage 3: Load replacement image (reported at 70%)
    if progress_callback:
        progress_callback("Loading replacement image", 0.7)

    replacement = self._load_replacement_image(replacement_image)

    # Stage 4: Composite replacement (overall progress 75-100%)
    if progress_callback:
        progress_callback("Compositing frames", 0.75)

    swapped_frames = self._composite_video(
        inpainted_frames=inpainted_frames,
        replacement=replacement,
        track=track,
        progress_callback=lambda msg, p: progress_callback(msg, 0.75 + p * 0.25) if progress_callback else None,
    )

    if progress_callback:
        progress_callback("Complete", 1.0)

    return SwapResult(
        swapped_frames=swapped_frames,
        object_track=track,
        inpainted_frames=inpainted_frames,
        source_prompt=source_object,
        replacement_image=str(replacement_image),
    )

remove_object

remove_object(
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    inpaint_prompt: str = "background, seamless, natural",
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Remove an object from video without replacement.

Segments the object and inpaints the background to remove it cleanly.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to remove.

required
reference_frame int | None

Frame index for initial segmentation.

None
inpaint_prompt str

Prompt to guide background generation.

'background, seamless, natural'
progress_callback Callable[[str, float], None] | None

Optional progress callback.

None

Returns:

Type Description
SwapResult

SwapResult with inpainted frames (swapped_frames equals inpainted_frames).

Example

result = swapper.remove_object(video, "watermark")
Video.from_frames(result.swapped_frames, video.fps).save("clean.mp4")

Source code in src/videopython/ai/swapping/swapper.py
def remove_object(
    self,
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    inpaint_prompt: str = "background, seamless, natural",
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Erase an object from a video by inpainting the background over it.

    The object described by ``source_object`` is segmented and tracked across
    all frames, then the masked region is filled with generated background.

    Args:
        video: Input video to process.
        source_object: Text description of object to remove.
        reference_frame: Frame index for initial segmentation.
        inpaint_prompt: Prompt to guide background generation.
        progress_callback: Optional progress callback.

    Returns:
        SwapResult with inpainted frames (swapped_frames equals inpainted_frames).

    Example:
        >>> result = swapper.remove_object(video, "watermark")
        >>> Video.from_frames(result.swapped_frames, video.fps).save("clean.mp4")
    """

    def report(stage: str, fraction: float) -> None:
        # Forward progress only when the caller supplied a callback.
        if progress_callback:
            progress_callback(stage, fraction)

    anchor_frame = self.config.reference_frame if reference_frame is None else reference_frame
    frame_array = video.frames

    # Stage 1 (0% - 40%): locate and track the object across every frame.
    report("Segmenting object to remove", 0.0)
    track = self._get_segmenter().segment_object(
        frames=frame_array,
        prompt=source_object,
        reference_frame=anchor_frame,
        progress_callback=lambda stage, fraction: report(stage, fraction * 0.4),
    )

    # Stage 2 (40% - 100%): fill the tracked region with generated background.
    report("Removing object", 0.4)
    cleaned = self._get_inpainter().inpaint(
        frames=frame_array,
        track=track,
        prompt=inpaint_prompt,
        progress_callback=lambda stage, fraction: report(stage, 0.4 + fraction * 0.6),
    )

    report("Complete", 1.0)

    # Removal has no replacement step, so the swapped output IS the
    # inpainted output.
    return SwapResult(
        swapped_frames=cleaned,
        object_track=track,
        inpainted_frames=cleaned,
        source_prompt=source_object,
    )

segment_only

segment_only(
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None]
    | None = None,
) -> SwapResult

Segment an object without swapping or inpainting.

Useful for previewing segmentation results before full processing.

Parameters:

Name Type Description Default
video Video

Input video to process.

required
source_object str

Text description of object to segment.

required
reference_frame int | None

Frame index for initial segmentation.

None
progress_callback Callable[[str, float], None] | None

Optional progress callback.

None

Returns:

Type Description
SwapResult

SwapResult with original frames and object track (no swapping performed).

Source code in src/videopython/ai/swapping/swapper.py
def segment_only(
    self,
    video: Video,
    source_object: str,
    reference_frame: int | None = None,
    progress_callback: Callable[[str, float], None] | None = None,
) -> SwapResult:
    """Run segmentation on its own, leaving the video frames untouched.

    Useful for previewing segmentation results before full processing.

    Args:
        video: Input video to process.
        source_object: Text description of object to segment.
        reference_frame: Frame index for initial segmentation.
        progress_callback: Optional progress callback.

    Returns:
        SwapResult with original frames and object track (no swapping performed).
    """
    anchor_frame = self.config.reference_frame if reference_frame is None else reference_frame
    frame_array = video.frames

    # Only the segmentation stage runs, so the caller's callback is
    # forwarded directly without rescaling.
    track = self._get_segmenter().segment_object(
        frames=frame_array,
        prompt=source_object,
        reference_frame=anchor_frame,
        progress_callback=progress_callback,
    )

    # Copy the frames so callers can mutate the result without touching
    # the source video's array.
    return SwapResult(
        swapped_frames=frame_array.copy(),
        object_track=track,
        source_prompt=source_object,
    )

visualize_track staticmethod

visualize_track(
    frames: ndarray,
    track: Any,
    color: tuple[int, int, int] = (255, 0, 0),
    alpha: float = 0.5,
) -> np.ndarray

Overlay object masks on video frames for visualization.

Parameters:

Name Type Description Default
frames ndarray

Video frames array of shape (N, H, W, C).

required
track Any

ObjectTrack to visualize.

required
color tuple[int, int, int]

RGB color for mask overlay.

(255, 0, 0)
alpha float

Opacity of mask overlay (0-1).

0.5

Returns:

Type Description
ndarray

Frames with mask overlay.

Source code in src/videopython/ai/swapping/swapper.py
@staticmethod
def visualize_track(
    frames: np.ndarray,
    track: Any,
    color: tuple[int, int, int] = (255, 0, 0),
    alpha: float = 0.5,
) -> np.ndarray:
    """Paint the tracked object's masks onto a copy of the video frames.

    Args:
        frames: Video frames array of shape (N, H, W, C).
        track: ObjectTrack to visualize.
        color: RGB color for mask overlay.
        alpha: Opacity of mask overlay (0-1).

    Returns:
        Frames with mask overlay.
    """
    output = frames.copy()
    tint = np.array(color, dtype=np.float32)

    for frame_idx in range(output.shape[0]):
        entry = track.get_mask_for_frame(frame_idx)
        # Skip frames where the object is absent or the mask is empty.
        if entry is None or entry.area == 0:
            continue

        region = entry.mask[..., None]

        # Alpha-blend the tint over the original pixels, but keep the
        # original values outside the mask.
        base = output[frame_idx].astype(np.float32)
        blended = base * (1.0 - alpha) + tint * alpha
        output[frame_idx] = np.where(region, blended, base).astype(np.uint8)

    return output

SwapResult

Result of a swap or remove operation.

result = swapper.swap(video, "car", "truck")

print(f"Processed {len(result.swapped_frames)} frames")
print(f"Object tracked: {result.source_prompt}")
print(f"Track confidence: {result.object_track.masks[0].confidence:.2f}")

SwapResult dataclass

Result of an object swapping operation.

Attributes:

Name Type Description
swapped_frames ndarray

Array of frames with object swapped, shape (N, H, W, C).

object_track ObjectTrack

The tracked object that was swapped.

inpainted_frames ndarray | None

Frames with object removed (background only), shape (N, H, W, C).

source_prompt str

Text prompt used to identify source object.

target_prompt str

Text prompt for the replacement object (if generated).

replacement_image str | None

Path to replacement image (if provided).

Source code in src/videopython/ai/swapping/models.py
@dataclass
class SwapResult:
    """Outcome of an object swap or removal run.

    Attributes:
        swapped_frames: Array of frames with object swapped, shape (N, H, W, C).
        object_track: The tracked object that was swapped.
        inpainted_frames: Frames with object removed (background only), shape (N, H, W, C).
        source_prompt: Text prompt used to identify source object.
        target_prompt: Text prompt for the replacement object (if generated).
        replacement_image: Path to replacement image (if provided).
    """

    swapped_frames: np.ndarray
    object_track: ObjectTrack
    inpainted_frames: np.ndarray | None = None
    source_prompt: str = ""
    target_prompt: str = ""
    replacement_image: str | None = None

    @property
    def num_frames(self) -> int:
        """Number of frames in the result."""
        # len() of an ndarray is its first dimension.
        return len(self.swapped_frames)

    @property
    def frame_size(self) -> tuple[int, int]:
        """Size of frames as (height, width)."""
        shape = self.swapped_frames.shape
        return (shape[1], shape[2])

    @property
    def has_inpainted_frames(self) -> bool:
        """Whether background-only frames were produced."""
        return self.inpainted_frames is not None

num_frames property

num_frames: int

Number of frames in the result.

frame_size property

frame_size: tuple[int, int]

Size of frames as (height, width).

has_inpainted_frames property

has_inpainted_frames: bool

Check if inpainted frames are available.

ObjectTrack

Tracked object across multiple frames.

ObjectTrack dataclass

A tracked object across multiple frames.

Attributes:

Name Type Description
object_id str

Unique identifier for this tracked object.

masks list[ObjectMask]

List of ObjectMask instances for each frame where object appears.

label str

Text label describing the object (e.g., "red car").

start_frame int

First frame index where object appears.

end_frame int

Last frame index where object appears.

Source code in src/videopython/ai/swapping/models.py
@dataclass
class ObjectTrack:
    """One object followed through a sequence of frames.

    Attributes:
        object_id: Unique identifier for this tracked object.
        masks: List of ObjectMask instances for each frame where object appears.
        label: Text label describing the object (e.g., "red car").
        start_frame: First frame index where object appears.
        end_frame: Last frame index where object appears.
    """

    object_id: str
    masks: list[ObjectMask]
    label: str
    start_frame: int
    end_frame: int

    @property
    def num_frames(self) -> int:
        """Number of frames this object appears in."""
        return len(self.masks)

    @property
    def frame_indices(self) -> list[int]:
        """List of frame indices where object appears."""
        return [entry.frame_index for entry in self.masks]

    @property
    def average_confidence(self) -> float:
        """Average confidence across all masks (0.0 for an empty track)."""
        if not self.masks:
            return 0.0
        total = sum(entry.confidence for entry in self.masks)
        return total / len(self.masks)

    def get_mask_for_frame(self, frame_index: int) -> ObjectMask | None:
        """Get the mask for a specific frame.

        Args:
            frame_index: The frame index to look up.

        Returns:
            The ObjectMask for that frame, or None if not present.
        """
        # Linear scan; returns the first mask whose frame index matches.
        return next((entry for entry in self.masks if entry.frame_index == frame_index), None)

    def get_masks_array(self) -> np.ndarray:
        """Get all masks as a stacked numpy array.

        Returns:
            Array of shape (N, H, W) where N is number of frames.

        Raises:
            ValueError: If the track contains no masks.
        """
        collected = [entry.mask for entry in self.masks]
        if not collected:
            raise ValueError("No masks in track")
        return np.stack(collected, axis=0)

num_frames property

num_frames: int

Number of frames this object appears in.

frame_indices property

frame_indices: list[int]

List of frame indices where object appears.

average_confidence property

average_confidence: float

Average confidence across all masks.

get_mask_for_frame

get_mask_for_frame(frame_index: int) -> ObjectMask | None

Get the mask for a specific frame.

Parameters:

Name Type Description Default
frame_index int

The frame index to look up.

required

Returns:

Type Description
ObjectMask | None

The ObjectMask for that frame, or None if not present.

Source code in src/videopython/ai/swapping/models.py
def get_mask_for_frame(self, frame_index: int) -> ObjectMask | None:
    """Get the mask for a specific frame.

    Args:
        frame_index: The frame index to look up.

    Returns:
        The ObjectMask for that frame, or None if not present.
    """
    # Linear scan; returns the first mask whose frame index matches.
    return next((entry for entry in self.masks if entry.frame_index == frame_index), None)

get_masks_array

get_masks_array() -> np.ndarray

Get all masks as a stacked numpy array.

Returns:

Type Description
ndarray

Array of shape (N, H, W) where N is number of frames.

Source code in src/videopython/ai/swapping/models.py
def get_masks_array(self) -> np.ndarray:
    """Get all masks as a stacked numpy array.

    Returns:
        Array of shape (N, H, W) where N is number of frames.

    Raises:
        ValueError: If the track contains no masks.
    """
    collected = [entry.mask for entry in self.masks]
    if not collected:
        raise ValueError("No masks in track")
    return np.stack(collected, axis=0)

ObjectMask

Single-frame object mask with confidence and bounding box.

ObjectMask dataclass

A mask representing an object in a single frame.

Attributes:

Name Type Description
frame_index int

Index of the frame this mask belongs to.

mask ndarray

Binary mask array of shape (H, W) where True indicates object pixels.

confidence float

Confidence score of the segmentation (0.0 to 1.0).

bounding_box tuple[float, float, float, float] | None

Optional bounding box as (x1, y1, x2, y2) normalized coordinates.

Source code in src/videopython/ai/swapping/models.py
@dataclass
class ObjectMask:
    """A mask covering one object in one frame.

    Attributes:
        frame_index: Index of the frame this mask belongs to.
        mask: Binary mask array of shape (H, W) where True indicates object pixels.
        confidence: Confidence score of the segmentation (0.0 to 1.0).
        bounding_box: Optional bounding box as (x1, y1, x2, y2) normalized coordinates.
    """

    frame_index: int
    mask: np.ndarray
    confidence: float
    bounding_box: tuple[float, float, float, float] | None = None

    def __post_init__(self) -> None:
        """Reject masks that are not 2D and confidences outside [0, 1]."""
        ndim = self.mask.ndim
        if ndim != 2:
            raise ValueError(f"Mask must be 2D, got shape {self.mask.shape}")
        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0 and 1, got {self.confidence}")

    @property
    def height(self) -> int:
        """Height of the mask."""
        return self.mask.shape[0]

    @property
    def width(self) -> int:
        """Width of the mask."""
        return self.mask.shape[1]

    @property
    def area(self) -> int:
        """Number of pixels in the mask."""
        return int((self.mask > 0).sum())

    def dilate(self, kernel_size: int = 5) -> ObjectMask:
        """Return a copy of this mask grown by morphological dilation.

        Args:
            kernel_size: Size of the dilation kernel.

        Returns:
            New ObjectMask with dilated mask.
        """
        # cv2 is imported lazily so the module loads without OpenCV installed.
        import cv2

        element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
        grown = cv2.dilate(self.mask.astype(np.uint8), element, iterations=1)
        return ObjectMask(
            frame_index=self.frame_index,
            mask=grown.astype(bool),
            confidence=self.confidence,
            bounding_box=self.bounding_box,
        )

height property

height: int

Height of the mask.

width property

width: int

Width of the mask.

area property

area: int

Number of pixels in the mask.

__post_init__

__post_init__() -> None

Validate mask shape and values.

Source code in src/videopython/ai/swapping/models.py
def __post_init__(self) -> None:
    """Reject masks that are not 2D and confidences outside [0, 1].

    Raises:
        ValueError: If the mask is not 2-dimensional or confidence is out of range.
    """
    ndim = self.mask.ndim
    if ndim != 2:
        raise ValueError(f"Mask must be 2D, got shape {self.mask.shape}")
    # Chained comparison kept so non-orderable values (e.g. NaN) also fail.
    if not 0.0 <= self.confidence <= 1.0:
        raise ValueError(f"Confidence must be between 0 and 1, got {self.confidence}")

dilate

dilate(kernel_size: int = 5) -> ObjectMask

Return a dilated version of this mask.

Parameters:

Name Type Description Default
kernel_size int

Size of the dilation kernel.

5

Returns:

Type Description
ObjectMask

New ObjectMask with dilated mask.

Source code in src/videopython/ai/swapping/models.py
def dilate(self, kernel_size: int = 5) -> ObjectMask:
    """Return a copy of this mask grown by morphological dilation.

    Args:
        kernel_size: Size of the dilation kernel.

    Returns:
        New ObjectMask with dilated mask.
    """
    # cv2 is imported lazily so the module loads without OpenCV installed.
    import cv2

    element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
    grown = cv2.dilate(self.mask.astype(np.uint8), element, iterations=1)
    return ObjectMask(
        frame_index=self.frame_index,
        mask=grown.astype(bool),
        confidence=self.confidence,
        bounding_box=self.bounding_box,
    )