Skip to content

AI Auto-Editing

LLM-authored editing: build a scene catalog from one or more sources and let a local vision-language model plan a VideoEdit from it. See the Automatic Editing guide for the end-to-end flow, and the MCP server for the agent-driven variant.

Basic Usage

from videopython.ai import AutoEditor, OllamaVisionLLM

# The planner is a local Ollama vision model; AutoEditor drives it to author the plan.
editor = AutoEditor(planner=OllamaVisionLLM(model="qwen3.6:27b"))
# edit() analyzes every source and returns the planned VideoEdit for the brief.
edit = editor.edit(["a.mp4", "b.mp4"], brief="A 15s teaser, most dynamic shot first.")
# NOTE(review): presumably renders the planned edit to this output path — confirm against VideoEdit.
edit.run_to_file("teaser.mp4")

The catalog / by-id plan

build_catalog projects VideoAnalysis results into an EditCatalog of candidate CatalogScenes (each with a stable id, exact bounds, caption, transcript) plus one keyframe per scene. The planner authors an EditPlan whose segments reference scenes by scene_id; resolve_plan maps those ids back to a runnable VideoEdit. The model never authors timestamps.

from videopython.ai import VideoAnalyzer, build_catalog
from videopython.ai.auto_edit import EditPlan, resolve_plan

# build_catalog consumes VideoAnalysis results, so analyze each source first.
analyses = [VideoAnalyzer().analyze_path("a.mp4")]
bundle = build_catalog(analyses)            # bundle.catalog + bundle.keyframes
# An EditPlan needs at least one segment; each segment names a catalog scene by id.
plan = EditPlan.model_validate({"segments": [{"scene_id": bundle.catalog.scenes[0].id}]})
edit = resolve_plan(plan, bundle.catalog)   # -> VideoEdit

Planner

AutoEditor

Plan a VideoEdit from source videos using a structured-vision planner.

Source code in src/videopython/ai/auto_edit/editor.py
class AutoEditor:
    """Drive a structured-vision planner until it yields a valid VideoEdit.

    The editor builds a scene catalog from the analyzed sources, asks the
    planner for an ``EditPlan`` and resolves it to a ``VideoEdit``. Unusable
    planner output and failed checks are fed back to the planner as extra
    prompt text for at most ``max_rounds`` rounds.
    """

    def __init__(
        self,
        planner: StructuredVisionLLM,
        *,
        analyzer: VideoAnalyzer | None = None,
        max_rounds: int = 3,
        normalize_target: _NormalizeTarget = "largest",
    ) -> None:
        """Store the planner and the retry/normalization settings.

        Args:
            planner: Backend that returns schema-shaped JSON for a prompt.
            analyzer: Optional analyzer to reuse; when omitted one is created
                lazily the first time ``edit`` needs it.
            max_rounds: Number of planning rounds before giving up.
            normalize_target: Target handed to ``normalize_dimensions``.
        """
        self.planner = planner
        self.max_rounds = max_rounds
        self.normalize_target = normalize_target
        self._analyzer = analyzer

    def edit(self, sources: Sequence[str | Path], brief: str, *, context: dict[str, Any] | None = None) -> VideoEdit:
        """Analyze ``sources`` and plan an edit for ``brief`` (runs the analyzer)."""
        analyses = []
        for source in sources:
            # The analyzer is fetched per source so nothing is created for an empty list.
            analyses.append(self._get_analyzer().analyze_path(source))
        return self.edit_from_analyses(analyses, brief, context=context)

    def edit_from_analyses(
        self, analyses: Sequence[VideoAnalysis], brief: str, *, context: dict[str, Any] | None = None
    ) -> VideoEdit:
        """Plan an edit from precomputed VideoAnalysis results (no model download).

        Raises:
            AutoEditError: If no round produced an edit that passes ``check``.
        """
        catalog_bundle = build_catalog(analyses)
        source_metadata = _metadata_by_source(analyses)
        merged_context = _merge_context(analyses, context)
        plan_schema = EditPlan.json_schema(strict=True)
        prompt_text, keyframes = _build_prompt(brief, catalog_bundle)

        feedback: str | None = None
        for _round in range(self.max_rounds):
            # From the second round on, the previous round's problems are appended to the prompt.
            prompt = prompt_text if feedback is None else f"{prompt_text}\n\n{feedback}"
            try:
                raw_plan = self.planner.generate_json(
                    system=_SYSTEM_PROMPT, text=prompt, images=keyframes or None, schema=plan_schema
                )
                candidate = resolve_plan(EditPlan.model_validate(raw_plan), catalog_bundle.catalog)
            except (PlannerError, ValidationError, UnknownSceneIdsError) as exc:
                # Unusable output: turn the error into feedback and try again.
                feedback = _shape_feedback(exc)
            else:
                candidate, _ = candidate.repair(source_metadata, context=merged_context, clamp_segment_end=True)
                candidate, _ = candidate.normalize_dimensions(
                    source_metadata, self.normalize_target, context=merged_context
                )
                problems = candidate.check(source_metadata, context=merged_context)
                if not problems:
                    return candidate
                problem_lines = [problem.to_prompt_line() for problem in problems]
                feedback = "The previous plan had these problems:\n" + "\n".join(problem_lines)

        raise AutoEditError(f"No valid edit after {self.max_rounds} round(s). Last feedback:\n{feedback}")

    def _get_analyzer(self) -> VideoAnalyzer:
        """Return the configured analyzer, creating a default one on first use."""
        if self._analyzer is not None:
            return self._analyzer

        # Imported lazily so the analysis module is only loaded when actually needed.
        from videopython.ai.video_analysis import VideoAnalyzer

        self._analyzer = VideoAnalyzer()
        return self._analyzer

edit

edit(
    sources: Sequence[str | Path],
    brief: str,
    *,
    context: dict[str, Any] | None = None,
) -> VideoEdit

Analyze sources and plan an edit for brief (runs the analyzer).

Source code in src/videopython/ai/auto_edit/editor.py
def edit(self, sources: Sequence[str | Path], brief: str, *, context: dict[str, Any] | None = None) -> VideoEdit:
    """Analyze ``sources`` and plan an edit for ``brief`` (runs the analyzer).

    Each source is analyzed in order and the collected analyses are handed to
    ``edit_from_analyses`` together with ``brief`` and ``context``.
    """
    analyses = []
    for source in sources:
        # The analyzer is fetched per source so nothing is created for an empty list.
        analyses.append(self._get_analyzer().analyze_path(source))
    return self.edit_from_analyses(analyses, brief, context=context)

edit_from_analyses

edit_from_analyses(
    analyses: Sequence[VideoAnalysis],
    brief: str,
    *,
    context: dict[str, Any] | None = None,
) -> VideoEdit

Plan an edit from precomputed VideoAnalysis results (no model download).

Source code in src/videopython/ai/auto_edit/editor.py
def edit_from_analyses(
    self, analyses: Sequence[VideoAnalysis], brief: str, *, context: dict[str, Any] | None = None
) -> VideoEdit:
    """Plan an edit from precomputed VideoAnalysis results (no model download).

    Builds the scene catalog and prompt once, then asks the planner for a plan
    for up to ``self.max_rounds`` rounds. Unusable planner output and failed
    checks become feedback appended to the next round's prompt.

    Raises:
        AutoEditError: If no round produced an edit that passes ``check``.
    """
    catalog_bundle = build_catalog(analyses)
    source_metadata = _metadata_by_source(analyses)
    merged_context = _merge_context(analyses, context)
    plan_schema = EditPlan.json_schema(strict=True)
    prompt_text, keyframes = _build_prompt(brief, catalog_bundle)

    feedback: str | None = None
    for _round in range(self.max_rounds):
        # From the second round on, the previous round's problems are appended to the prompt.
        prompt = prompt_text if feedback is None else f"{prompt_text}\n\n{feedback}"
        try:
            raw_plan = self.planner.generate_json(
                system=_SYSTEM_PROMPT, text=prompt, images=keyframes or None, schema=plan_schema
            )
            candidate = resolve_plan(EditPlan.model_validate(raw_plan), catalog_bundle.catalog)
        except (PlannerError, ValidationError, UnknownSceneIdsError) as exc:
            # Unusable output: turn the error into feedback and try again.
            feedback = _shape_feedback(exc)
        else:
            candidate, _ = candidate.repair(source_metadata, context=merged_context, clamp_segment_end=True)
            candidate, _ = candidate.normalize_dimensions(
                source_metadata, self.normalize_target, context=merged_context
            )
            problems = candidate.check(source_metadata, context=merged_context)
            if not problems:
                return candidate
            problem_lines = [problem.to_prompt_line() for problem in problems]
            feedback = "The previous plan had these problems:\n" + "\n".join(problem_lines)

    raise AutoEditError(f"No valid edit after {self.max_rounds} round(s). Last feedback:\n{feedback}")

OllamaVisionLLM

A StructuredVisionLLM backed by a local Ollama server.

The model must be vision-capable (it is sent keyframes) AND support Ollama's structured-output `format` parameter (the EditPlan schema constrains the decode). The default `qwen3.6:27b` is an Apache-2.0 vision model; not every model supports schema conditioning (some builds, e.g. certain MLX ones, fail it), so confirm that `format` works for a custom model locally. Run `ollama pull <model>` first; `options` are extra generation options merged over `temperature=0`.

Thin wrapper over the shared `OllamaStructuredClient`: its only job is to translate `OllamaError` into the `PlannerError` the editor retries on.

Source code in src/videopython/ai/auto_edit/local.py
class OllamaVisionLLM:
    """StructuredVisionLLM implementation that talks to a local Ollama server.

    Requirements on the model: it must accept images (keyframes are sent to
    it) and it must honour Ollama's structured-output ``format`` (the EditPlan
    schema constrains the decode). ``qwen3.6:27b``, the default, is an
    Apache-2.0 vision model. Schema conditioning is not supported by every
    model (some builds, e.g. certain MLX ones, fail it), so check that
    ``format`` works locally before relying on a custom model. Run
    ``ollama pull <model>`` first. ``options`` are extra generation options
    merged over ``temperature=0``.

    The class is a thin wrapper over the shared
    :class:`OllamaStructuredClient`; all it adds is the translation of
    :class:`OllamaError` into the :class:`PlannerError` the editor retries on.
    """

    def __init__(
        self,
        model: str = DEFAULT_OLLAMA_MODEL,
        *,
        host: str | None = None,
        options: dict[str, Any] | None = None,
    ) -> None:
        """Create the underlying structured client for ``model`` on ``host``."""
        self._client = OllamaStructuredClient(model=model, host=host, options=options)

    def generate_json(
        self, *, system: str, text: str, images: list[np.ndarray] | None, schema: dict[str, Any]
    ) -> dict[str, Any]:
        """Delegate to the client, re-raising its errors as ``PlannerError``.

        Raises:
            PlannerError: When the client raises ``OllamaError``.
        """
        try:
            # An empty image list is passed on as None.
            payload = self._client.generate_json(system=system, text=text, schema=schema, images=images or None)
        except OllamaError as exc:
            raise PlannerError(str(exc)) from exc
        return payload

StructuredVisionLLM

Bases: Protocol

Returns schema-shaped JSON from a system prompt + text + optional keyframes.

The signature mirrors `videopython.ai._ollama.OllamaStructuredClient.generate_json`, so any structured-generation client satisfies it structurally. Implementations raise `PlannerError` on unusable output (the editor retries those); infra errors should propagate so they are not silently retried.

Source code in src/videopython/ai/auto_edit/backend.py
@runtime_checkable
class StructuredVisionLLM(Protocol):
    """Returns schema-shaped JSON from a system prompt + text + optional keyframes.

    The signature mirrors
    :meth:`videopython.ai._ollama.OllamaStructuredClient.generate_json`, so any
    structured-generation client satisfies it structurally. Implementations
    raise :class:`PlannerError` on unusable output (the editor retries those);
    infra errors should propagate so they are not silently retried.

    All ``generate_json`` arguments are keyword-only:

    * ``system`` -- the system prompt.
    * ``text`` -- the user prompt text.
    * ``images`` -- optional keyframes as arrays, or ``None`` when there are none.
    * ``schema`` -- the JSON schema the returned dict is expected to follow.
    """

    # Protocol stub: implementations supply the body.
    def generate_json(
        self, *, system: str, text: str, images: list[np.ndarray] | None, schema: dict[str, Any]
    ) -> dict[str, Any]: ...

Catalog & plan

build_catalog

build_catalog(
    analyses: Sequence[VideoAnalysis],
    *,
    keyframes: bool = True,
    max_transcript_chars: int = DEFAULT_TRANSCRIPT_CHARS,
) -> CatalogBundle

Project VideoAnalysis results into a flat scene catalog with one midpoint keyframe per scene.

Source code in src/videopython/ai/auto_edit/catalog.py
def build_catalog(
    analyses: Sequence[VideoAnalysis],
    *,
    keyframes: bool = True,
    max_transcript_chars: int = DEFAULT_TRANSCRIPT_CHARS,
) -> CatalogBundle:
    """Flatten VideoAnalysis results into one scene catalog, with a midpoint keyframe per scene.

    Args:
        analyses: Analysis results, one per source video.
        keyframes: When true, extract one frame at each scene's midpoint.
        max_transcript_chars: Limit passed to the transcript excerpt helper.

    Returns:
        A ``CatalogBundle`` holding the catalog and the keyframes by scene id.

    Raises:
        ValueError: If keyframes are requested for a scene whose analysis has
            no source path.
    """
    catalog_scenes: list[CatalogScene] = []
    keyframe_by_id: dict[str, np.ndarray] = {}
    taken_ids: set[str] = set()

    for analysis in analyses:
        path = analysis.source.path
        scene_samples = analysis.scenes.samples if analysis.scenes else []
        transcription = analysis.audio.transcription if analysis.audio else None

        # Analyses without a path fall back to a generic "clip" stem and source.
        if path:
            stem = Path(path).stem
            scene_source = Path(path)
        else:
            stem = "clip"
            scene_source = Path(stem)

        for sample in scene_samples:
            scene_id = _unique_id(stem, sample.scene_index, taken_ids)
            caption, shot_type = _description(sample)
            excerpt = _transcript_excerpt(
                transcription, sample.start_second, sample.end_second, max_transcript_chars
            )
            scene = CatalogScene(
                id=scene_id,
                source=scene_source,
                start=sample.start_second,
                end=sample.end_second,
                duration=max(0.0, sample.end_second - sample.start_second),
                shot_type=shot_type,
                caption=caption,
                transcript=excerpt,
                has_speech=bool(excerpt),
                has_faces=bool(sample.faces),
            )
            catalog_scenes.append(scene)

            if not keyframes:
                continue
            if path is None:
                raise ValueError(f"Scene {scene_id!r} has no source path to extract a keyframe from.")
            midpoint = (sample.start_second + sample.end_second) / 2.0
            extracted = extract_frames_at_times(path, [midpoint])
            keyframe_by_id[scene_id] = extracted[0]

    catalog = EditCatalog(scenes=catalog_scenes)
    return CatalogBundle(catalog=catalog, keyframes=keyframe_by_id)

resolve_plan

resolve_plan(
    plan: EditPlan, catalog: EditCatalog
) -> VideoEdit

Map each plan segment's scene_id to its exact source/start/end.

Source code in src/videopython/ai/auto_edit/resolve.py
def resolve_plan(plan: EditPlan, catalog: EditCatalog) -> VideoEdit:
    """Turn a by-id plan into a VideoEdit using the catalog's exact scene bounds.

    Every segment's ``scene_id`` is looked up in ``catalog``; the matching
    scene supplies ``source``, ``start`` and ``end``, while ``operations`` and
    ``transition_in`` come from the plan segment itself.

    Raises:
        UnknownSceneIdsError: If any segment references an id that is not in
            the catalog; the error carries every such id in plan order.
    """
    scenes = catalog.by_id()

    # Collect every unknown id first so one error reports them all.
    missing = []
    for seg in plan.segments:
        if seg.scene_id not in scenes:
            missing.append(seg.scene_id)
    if missing:
        raise UnknownSceneIdsError(missing)

    resolved = []
    for seg in plan.segments:
        scene = scenes[seg.scene_id]
        resolved.append(
            SegmentConfig(
                source=scene.source,
                start=scene.start,
                end=scene.end,
                operations=seg.operations,
                transition_in=seg.transition_in,
            )
        )
    return VideoEdit(segments=resolved, post_operations=plan.post_operations)

EditPlan

Bases: BaseModel

The planner's output: an ordered selection of catalog scenes, referenced by id.

Source code in src/videopython/ai/auto_edit/models.py
class EditPlan(BaseModel):
    """The planner's output: an ordered selection of catalog scenes, referenced by id.

    Segments carry a ``scene_id`` rather than timestamps; unknown fields are
    rejected (``extra="forbid"``) and at least one segment is required.
    """

    model_config = ConfigDict(extra="forbid")

    segments: list[PlanSegment] = Field(min_length=1, description="Ordered plan segments.")
    post_operations: list[OperationInput] = Field(
        default_factory=list, description="Operations applied once to the whole assembled program."
    )

    @classmethod
    def json_schema(cls, *, strict: bool = False) -> dict[str, Any]:
        """The by-id mirror of VideoEdit.json_schema, reusing the op union and strict rewrite.

        Args:
            strict: When false, return the hand-built schema as is. When true,
                hoist the operation schema's ``$defs`` (if any) to the top
                level and return the result of ``_to_strict_schema``.

        Returns:
            A draft-07 JSON schema dict describing an ``EditPlan``.
        """
        # One operation schema is shared by the per-segment and post-operation arrays.
        op_schema = Operation.json_schema()

        # Schema for a single plan segment; only ``scene_id`` is required.
        segment_schema: dict[str, Any] = {
            "type": "object",
            "description": PlanSegment.__doc__,
            "properties": {
                "scene_id": field_schema(PlanSegment, "scene_id"),
                "operations": array_field_schema(PlanSegment, "operations", op_schema),
                "transition_in": optional_model_field_schema(TransitionSpec, PlanSegment, "transition_in"),
            },
            "required": ["scene_id"],
            "additionalProperties": False,
        }
        # Start from the field-level schema of ``segments`` and attach the item schema to it.
        segments = field_schema(cls, "segments")
        segments["items"] = segment_schema
        schema: dict[str, Any] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "description": cls.__doc__,
            "properties": {
                "segments": segments,
                "post_operations": array_field_schema(cls, "post_operations", op_schema),
            },
            "required": ["segments"],
            "additionalProperties": False,
        }
        if not strict:
            return schema
        # NOTE(review): this pop mutates ``op_schema`` after it was handed to
        # ``array_field_schema``; whether the nested copies are affected depends
        # on that helper copying its argument -- confirm.
        op_defs = op_schema.pop("$defs", None)
        if op_defs:
            schema["$defs"] = op_defs
        return _to_strict_schema(schema)

json_schema classmethod

json_schema(*, strict: bool = False) -> dict[str, Any]

The by-id mirror of VideoEdit.json_schema, reusing the op union and strict rewrite.

Source code in src/videopython/ai/auto_edit/models.py
@classmethod
def json_schema(cls, *, strict: bool = False) -> dict[str, Any]:
    """Build the JSON schema for a by-id plan, mirroring VideoEdit.json_schema.

    The operation union schema is reused for both the per-segment
    ``operations`` and the top-level ``post_operations``. With ``strict`` the
    operation ``$defs`` (if any) are hoisted to the top level and the schema
    is passed through the strict rewrite before being returned.
    """
    operation_schema = Operation.json_schema()

    # Per-segment schema; only ``scene_id`` is required.
    segment_properties: dict[str, Any] = {
        "scene_id": field_schema(PlanSegment, "scene_id"),
        "operations": array_field_schema(PlanSegment, "operations", operation_schema),
        "transition_in": optional_model_field_schema(TransitionSpec, PlanSegment, "transition_in"),
    }
    segment_item: dict[str, Any] = {
        "type": "object",
        "description": PlanSegment.__doc__,
        "properties": segment_properties,
        "required": ["scene_id"],
        "additionalProperties": False,
    }

    segments_field = field_schema(cls, "segments")
    segments_field["items"] = segment_item
    post_operations_field = array_field_schema(cls, "post_operations", operation_schema)

    root: dict[str, Any] = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "description": cls.__doc__,
        "properties": {
            "segments": segments_field,
            "post_operations": post_operations_field,
        },
        "required": ["segments"],
        "additionalProperties": False,
    }

    if strict:
        # Hoist the shared operation definitions before the strict rewrite.
        shared_defs = operation_schema.pop("$defs", None)
        if shared_defs:
            root["$defs"] = shared_defs
        return _to_strict_schema(root)
    return root

EditCatalog

Bases: BaseModel

The candidate scenes across all source videos.

Source code in src/videopython/ai/auto_edit/models.py
class EditCatalog(BaseModel):
    """All candidate scenes, gathered across every source video."""

    scenes: list[CatalogScene]

    def by_id(self) -> dict[str, CatalogScene]:
        """Return the scenes keyed by id; on a repeated id the later scene wins."""
        lookup: dict[str, CatalogScene] = {}
        for scene in self.scenes:
            lookup[scene.id] = scene
        return lookup

CatalogScene

Bases: BaseModel

A candidate scene the planner picks by id; carries the exact source bounds.

Source code in src/videopython/ai/auto_edit/models.py
class CatalogScene(BaseModel):
    """A candidate scene the planner picks by id; carries the exact source bounds.

    ``resolve_plan`` copies ``source``, ``start`` and ``end`` from the scene a
    plan segment references. The remaining fields describe the scene.
    """

    # Identifier that plan segments reference; build_catalog derives it from the
    # source stem and scene index through a uniqueness helper.
    id: str
    # Source video of the scene (build_catalog falls back to a "clip" placeholder path).
    source: Path
    # Scene bounds, filled from the sample's start_second / end_second.
    start: float
    end: float
    # build_catalog stores max(0.0, end - start).
    duration: float
    shot_type: str | None = None
    caption: str = ""
    # Transcript excerpt for the scene's time range; empty when there is none.
    transcript: str = ""
    # build_catalog sets this to whether the transcript excerpt is non-empty.
    has_speech: bool = False
    # build_catalog sets this to whether the scene sample reported any faces.
    has_faces: bool = False

Errors

AutoEditError

Bases: AiError, RuntimeError

The planner could not produce a valid edit within the retry budget.

Source code in src/videopython/ai/auto_edit/editor.py
class AutoEditError(AiError, RuntimeError):
    """The planner could not produce a valid edit within the retry budget.

    Raised by ``AutoEditor.edit_from_analyses`` once ``max_rounds`` planning
    rounds have passed without an edit that passes its checks; the message
    includes the most recent feedback.
    """

PlannerError

Bases: AiError, RuntimeError

A backend produced unusable output; the editor retries (infra errors should propagate instead).

Source code in src/videopython/ai/auto_edit/backend.py
class PlannerError(AiError, RuntimeError):
    """A backend produced unusable output; the editor retries (infra errors should propagate instead).

    ``AutoEditor.edit_from_analyses`` catches this error, turns it into
    feedback for the planner and moves on to the next round.
    """

UnknownSceneIdsError

Bases: AiError, ValueError

An EditPlan referenced scene ids absent from the catalog.

Source code in src/videopython/ai/auto_edit/resolve.py
class UnknownSceneIdsError(AiError, ValueError):
    """Raised when an EditPlan names scene ids that the catalog does not contain."""

    def __init__(self, ids: list[str]) -> None:
        """Keep the offending ids as given and report them de-duplicated and sorted."""
        self.ids = ids
        distinct_ids = sorted(set(ids))
        message = f"Plan references unknown scene ids: {distinct_ids}"
        super().__init__(message)