AI Auto-Editing
LLM-authored editing: build a scene catalog from one or more sources and let a
local vision-language model plan a VideoEdit from it. See the
Automatic Editing guide for the end-to-end flow,
and the MCP server for the agent-driven variant.
Basic Usage
from videopython.ai import AutoEditor, OllamaVisionLLM

# The planner is a local Ollama vision model; pull it with `ollama pull <model>` beforehand.
editor = AutoEditor(planner=OllamaVisionLLM(model="qwen3.6:27b"))
# edit() analyzes every source, then asks the planner for an edit that matches the brief.
edit = editor.edit(["a.mp4", "b.mp4"], brief="A 15s teaser, most dynamic shot first.")
edit.run_to_file("teaser.mp4")
The catalog / by-id plan
build_catalog projects VideoAnalysis results into an EditCatalog of
candidate CatalogScenes (each with a stable id, exact bounds, caption,
transcript) plus one keyframe per scene. The planner authors an EditPlan whose
segments reference scenes by scene_id; resolve_plan maps those ids back to a
runnable VideoEdit. The model never authors timestamps.
from videopython.ai import VideoAnalyzer, build_catalog
from videopython.ai.auto_edit import EditPlan, resolve_plan

# One VideoAnalysis per source video.
analyses = [VideoAnalyzer().analyze_path("a.mp4")]
bundle = build_catalog(analyses)  # bundle.catalog + bundle.keyframes
# A hand-written plan that selects the first catalog scene by its id.
plan = EditPlan.model_validate({"segments": [{"scene_id": bundle.catalog.scenes[0].id}]})
edit = resolve_plan(plan, bundle.catalog)  # -> VideoEdit
Planner
AutoEditor
Plan a VideoEdit from source videos using a structured-vision planner.
Source code in src/videopython/ai/auto_edit/editor.py
class AutoEditor:
    """Plan a VideoEdit from source videos using a structured-vision planner.

    Every planning round sends the prompt built from the brief and the catalog
    bundle to ``planner``. A round whose output cannot be produced, validated or
    resolved, or whose resolved edit still fails ``check`` after ``repair`` and
    ``normalize_dimensions``, is retried with the problems appended to the prompt.

    Args:
        planner: Backend that returns schema-shaped JSON for the prompt.
        analyzer: Analyzer used by :meth:`edit`; when omitted, a default
            ``VideoAnalyzer`` is created lazily on first use.
        max_rounds: Maximum number of planning rounds; must be at least 1.
        normalize_target: Forwarded to ``VideoEdit.normalize_dimensions``.

    Raises:
        ValueError: If ``max_rounds`` is less than 1.
    """

    def __init__(
        self,
        planner: StructuredVisionLLM,
        *,
        analyzer: VideoAnalyzer | None = None,
        max_rounds: int = 3,
        normalize_target: _NormalizeTarget = "largest",
    ) -> None:
        # With fewer than one round the planner would never be called and the
        # final error would report "None" as its feedback, so fail fast instead.
        if max_rounds < 1:
            raise ValueError(f"max_rounds must be at least 1, got {max_rounds}")
        self.planner = planner
        self._analyzer = analyzer
        self.max_rounds = max_rounds
        self.normalize_target = normalize_target

    def edit(self, sources: Sequence[str | Path], brief: str, *, context: dict[str, Any] | None = None) -> VideoEdit:
        """Analyze ``sources`` and plan an edit for ``brief`` (runs the analyzer)."""
        analyses = [self._get_analyzer().analyze_path(source) for source in sources]
        return self.edit_from_analyses(analyses, brief, context=context)

    def edit_from_analyses(
        self, analyses: Sequence[VideoAnalysis], brief: str, *, context: dict[str, Any] | None = None
    ) -> VideoEdit:
        """Plan an edit from precomputed VideoAnalysis results (no model download).

        Raises:
            AutoEditError: If no round yields an edit that passes ``check``.
        """
        bundle = build_catalog(analyses)
        metadata = _metadata_by_source(analyses)
        run_context = _merge_context(analyses, context)
        schema = EditPlan.json_schema(strict=True)
        base_text, images = _build_prompt(brief, bundle)
        feedback: str | None = None
        for _ in range(self.max_rounds):
            # From the second round on, the previous round's problems are appended to the prompt.
            text = base_text if feedback is None else f"{base_text}\n\n{feedback}"
            try:
                raw = self.planner.generate_json(system=_SYSTEM_PROMPT, text=text, images=images or None, schema=schema)
                edit = resolve_plan(EditPlan.model_validate(raw), bundle.catalog)
            except (PlannerError, ValidationError, UnknownSceneIdsError) as exc:
                # Only these planner/plan errors are retried; anything else propagates.
                feedback = _shape_feedback(exc)
                continue
            edit, _ = edit.repair(metadata, context=run_context, clamp_segment_end=True)
            edit, _ = edit.normalize_dimensions(metadata, self.normalize_target, context=run_context)
            errors = edit.check(metadata, context=run_context)
            if not errors:
                return edit
            feedback = "The previous plan had these problems:\n" + "\n".join(e.to_prompt_line() for e in errors)
        raise AutoEditError(f"No valid edit after {self.max_rounds} round(s). Last feedback:\n{feedback}")

    def _get_analyzer(self) -> VideoAnalyzer:
        """Return the configured analyzer, creating the default one on first use."""
        if self._analyzer is None:
            # Imported lazily so the analysis module is only loaded when it is actually needed.
            from videopython.ai.video_analysis import VideoAnalyzer

            self._analyzer = VideoAnalyzer()
        return self._analyzer
edit
edit(
sources: Sequence[str | Path],
brief: str,
*,
context: dict[str, Any] | None = None,
) -> VideoEdit
Analyze sources and plan an edit for brief (runs the analyzer).
Source code in src/videopython/ai/auto_edit/editor.py
def edit(self, sources: Sequence[str | Path], brief: str, *, context: dict[str, Any] | None = None) -> VideoEdit:
    """Run the analyzer over each source, then plan an edit for ``brief``.

    The analyses are handed to ``edit_from_analyses`` in source order, together
    with ``brief`` and ``context``.
    """
    analyses = []
    for source in sources:
        # The analyzer is requested per source, so nothing is created for an empty input.
        analyzer = self._get_analyzer()
        analyses.append(analyzer.analyze_path(source))
    return self.edit_from_analyses(analyses, brief, context=context)
edit_from_analyses
edit_from_analyses(
analyses: Sequence[VideoAnalysis],
brief: str,
*,
context: dict[str, Any] | None = None,
) -> VideoEdit
Plan an edit from precomputed VideoAnalysis results (no model download).
Source code in src/videopython/ai/auto_edit/editor.py
def edit_from_analyses(
    self, analyses: Sequence[VideoAnalysis], brief: str, *, context: dict[str, Any] | None = None
) -> VideoEdit:
    """Ask the planner for an edit over already-computed analyses.

    Runs up to ``self.max_rounds`` planning rounds. A round that fails feeds its
    problems into the next prompt. The first edit that passes ``check`` after
    ``repair`` and ``normalize_dimensions`` is returned.

    Raises:
        AutoEditError: If no round produced an edit without check errors.
    """
    bundle = build_catalog(analyses)
    metadata = _metadata_by_source(analyses)
    run_context = _merge_context(analyses, context)
    schema = EditPlan.json_schema(strict=True)
    base_text, images = _build_prompt(brief, bundle)

    feedback: str | None = None
    for _ in range(self.max_rounds):
        # The first round uses the bare prompt; later rounds append the previous problems.
        if feedback is None:
            prompt = base_text
        else:
            prompt = f"{base_text}\n\n{feedback}"

        try:
            raw = self.planner.generate_json(
                system=_SYSTEM_PROMPT,
                text=prompt,
                images=images or None,
                schema=schema,
            )
            plan = EditPlan.model_validate(raw)
            candidate = resolve_plan(plan, bundle.catalog)
        except (PlannerError, ValidationError, UnknownSceneIdsError) as exc:
            # Unusable planner output: report it back and spend another round.
            feedback = _shape_feedback(exc)
        else:
            candidate, _ = candidate.repair(metadata, context=run_context, clamp_segment_end=True)
            candidate, _ = candidate.normalize_dimensions(metadata, self.normalize_target, context=run_context)
            problems = candidate.check(metadata, context=run_context)
            if not problems:
                return candidate
            problem_lines = [problem.to_prompt_line() for problem in problems]
            feedback = "The previous plan had these problems:\n" + "\n".join(problem_lines)

    raise AutoEditError(f"No valid edit after {self.max_rounds} round(s). Last feedback:\n{feedback}")
OllamaVisionLLM
A StructuredVisionLLM backed by a local Ollama server.
The model must be vision-capable (it is sent keyframes) AND support Ollama's
structured-output `format` (the EditPlan schema constrains the decode). The
default `qwen3.6:27b` is an Apache-2.0 vision model; not every model supports
schema conditioning (some builds, e.g. certain MLX ones, fail it), so confirm
that `format` works for a custom model locally. Run `ollama pull <model>` first;
`options` are extra generation options merged over `temperature=0`.
Thin wrapper over the shared `OllamaStructuredClient`: its only job is
to translate `OllamaError` into the `PlannerError` that the editor
retries on.
Source code in src/videopython/ai/auto_edit/local.py
class OllamaVisionLLM:
    """A StructuredVisionLLM that talks to a local Ollama server.

    Requirements on the model: it has to accept images (keyframes are sent to
    it) and it has to honour Ollama's structured-output ``format``, because the
    EditPlan schema constrains the decode. The default ``qwen3.6:27b`` is an
    Apache-2.0 vision model. Schema conditioning is not universal (some builds,
    e.g. certain MLX ones, fail it), so check locally that ``format`` works
    before relying on a custom model. Run ``ollama pull <model>`` beforehand.
    ``options`` are extra generation options merged over ``temperature=0``.

    All the work is delegated to the shared :class:`OllamaStructuredClient`.
    This class only converts :class:`OllamaError` into the
    :class:`PlannerError` that the editor retries on.
    """

    def __init__(
        self,
        model: str = DEFAULT_OLLAMA_MODEL,
        *,
        host: str | None = None,
        options: dict[str, Any] | None = None,
    ) -> None:
        """Create the underlying structured client for ``model`` on ``host``."""
        self._client = OllamaStructuredClient(model=model, host=host, options=options)

    def generate_json(
        self, *, system: str, text: str, images: list[np.ndarray] | None, schema: dict[str, Any]
    ) -> dict[str, Any]:
        """Forward the request to the client, re-raising OllamaError as PlannerError."""
        # An empty image list is sent as "no images".
        frames = images or None
        try:
            payload = self._client.generate_json(system=system, text=text, schema=schema, images=frames)
        except OllamaError as exc:
            raise PlannerError(str(exc)) from exc
        return payload
StructuredVisionLLM
Bases: Protocol
Returns schema-shaped JSON from a system prompt + text + optional keyframes.
The signature mirrors
`videopython.ai._ollama.OllamaStructuredClient.generate_json`, so any
structured-generation client satisfies it structurally. Implementations
raise `PlannerError` on unusable output (the editor retries those);
infra errors should propagate so they are not silently retried.
Source code in src/videopython/ai/auto_edit/backend.py
@runtime_checkable
class StructuredVisionLLM(Protocol):
    """Returns schema-shaped JSON from a system prompt + text + optional keyframes.

    The signature mirrors
    :meth:`videopython.ai._ollama.OllamaStructuredClient.generate_json`, so any
    structured-generation client satisfies it structurally. Implementations
    raise :class:`PlannerError` on unusable output (the editor retries those);
    infra errors should propagate so they are not silently retried.
    """

    def generate_json(
        self, *, system: str, text: str, images: list[np.ndarray] | None, schema: dict[str, Any]
    ) -> dict[str, Any]:
        """Generate one JSON object constrained by ``schema``.

        Args:
            system: System prompt for the model.
            text: User prompt text.
            images: Keyframes to attach, or ``None`` to send no images.
            schema: JSON schema the output must conform to.

        Returns:
            The decoded JSON object.

        Raises:
            PlannerError: When the backend's output is unusable.
        """
        ...
Catalog & plan
build_catalog
build_catalog(
analyses: Sequence[VideoAnalysis],
*,
keyframes: bool = True,
max_transcript_chars: int = DEFAULT_TRANSCRIPT_CHARS,
) -> CatalogBundle
Project VideoAnalysis results into a flat scene catalog with one midpoint keyframe per scene.
Source code in src/videopython/ai/auto_edit/catalog.py
def build_catalog(
    analyses: Sequence[VideoAnalysis],
    *,
    keyframes: bool = True,
    max_transcript_chars: int = DEFAULT_TRANSCRIPT_CHARS,
) -> CatalogBundle:
    """Flatten VideoAnalysis results into one scene catalog, with a midpoint keyframe per scene.

    Args:
        analyses: Analysis results, one per source video.
        keyframes: When true, extract the frame at each scene's midpoint.
        max_transcript_chars: Limit forwarded to the transcript excerpt helper.

    Returns:
        The catalog of scenes plus the extracted keyframes keyed by scene id.

    Raises:
        ValueError: If ``keyframes`` is true and a scene's analysis has no source path.
    """
    catalog_scenes: list[CatalogScene] = []
    keyframe_by_id: dict[str, np.ndarray] = {}
    taken_ids: set[str] = set()

    for analysis in analyses:
        source_path = analysis.source.path
        if analysis.scenes:
            samples = analysis.scenes.samples
        else:
            samples = []
        if analysis.audio:
            transcription = analysis.audio.transcription
        else:
            transcription = None
        # Analyses without a source path fall back to the placeholder stem "clip".
        if source_path:
            stem = Path(source_path).stem
        else:
            stem = "clip"

        for sample in samples:
            begin = sample.start_second
            finish = sample.end_second
            scene_id = _unique_id(stem, sample.scene_index, taken_ids)
            caption, shot_type = _description(sample)
            transcript = _transcript_excerpt(transcription, begin, finish, max_transcript_chars)
            scene = CatalogScene(
                id=scene_id,
                source=Path(source_path) if source_path else Path(stem),
                start=begin,
                end=finish,
                # Never report a negative duration for inverted bounds.
                duration=max(0.0, finish - begin),
                shot_type=shot_type,
                caption=caption,
                transcript=transcript,
                has_speech=bool(transcript),
                has_faces=bool(sample.faces),
            )
            catalog_scenes.append(scene)

            if not keyframes:
                continue
            if source_path is None:
                raise ValueError(f"Scene {scene_id!r} has no source path to extract a keyframe from.")
            midpoint = (begin + finish) / 2.0
            keyframe_by_id[scene_id] = extract_frames_at_times(source_path, [midpoint])[0]

    return CatalogBundle(catalog=EditCatalog(scenes=catalog_scenes), keyframes=keyframe_by_id)
resolve_plan
resolve_plan(
plan: EditPlan, catalog: EditCatalog
) -> VideoEdit
Map each plan segment's scene_id to its exact source/start/end.
Source code in src/videopython/ai/auto_edit/resolve.py
def resolve_plan(plan: EditPlan, catalog: EditCatalog) -> VideoEdit:
    """Turn a by-id plan into a VideoEdit using the catalog's exact source/start/end.

    Raises:
        UnknownSceneIdsError: If any segment names a scene id missing from the
            catalog; every such id is reported, in plan order.
    """
    scenes = catalog.by_id()

    # Validate all ids up front so the error lists every unknown one.
    missing = []
    for item in plan.segments:
        if item.scene_id not in scenes:
            missing.append(item.scene_id)
    if missing:
        raise UnknownSceneIdsError(missing)

    resolved = []
    for item in plan.segments:
        scene = scenes[item.scene_id]
        resolved.append(
            SegmentConfig(
                source=scene.source,
                start=scene.start,
                end=scene.end,
                operations=item.operations,
                transition_in=item.transition_in,
            )
        )
    return VideoEdit(segments=resolved, post_operations=plan.post_operations)
EditPlan
Bases: BaseModel
The planner's output: an ordered selection of catalog scenes, referenced by id.
Source code in src/videopython/ai/auto_edit/models.py
class EditPlan(BaseModel):
    """The planner's output: an ordered selection of catalog scenes, referenced by id."""

    # NOTE: the class docstring above is emitted as the schema "description" by json_schema().
    model_config = ConfigDict(extra="forbid")

    segments: list[PlanSegment] = Field(min_length=1, description="Ordered plan segments.")
    post_operations: list[OperationInput] = Field(
        default_factory=list, description="Operations applied once to the whole assembled program."
    )

    @classmethod
    def json_schema(cls, *, strict: bool = False) -> dict[str, Any]:
        """Build the plan's JSON schema: the by-id counterpart of VideoEdit.json_schema.

        Segments reference scenes through ``scene_id``. The operation union is
        taken from ``Operation.json_schema()``. With ``strict=True`` the
        operation ``$defs`` are moved to the schema root and the result goes
        through the strict rewrite.
        """
        op_schema = Operation.json_schema()

        segment_properties = {
            "scene_id": field_schema(PlanSegment, "scene_id"),
            "operations": array_field_schema(PlanSegment, "operations", op_schema),
            "transition_in": optional_model_field_schema(TransitionSpec, PlanSegment, "transition_in"),
        }
        segment_schema: dict[str, Any] = {
            "type": "object",
            "description": PlanSegment.__doc__,
            "properties": segment_properties,
            "required": ["scene_id"],
            "additionalProperties": False,
        }

        segments = field_schema(cls, "segments")
        segments["items"] = segment_schema
        plan_properties = {
            "segments": segments,
            "post_operations": array_field_schema(cls, "post_operations", op_schema),
        }
        schema: dict[str, Any] = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "description": cls.__doc__,
            "properties": plan_properties,
            "required": ["segments"],
            "additionalProperties": False,
        }

        if strict:
            # Runs after op_schema was handed to the array helpers: its $defs move to the root.
            op_defs = op_schema.pop("$defs", None)
            if op_defs:
                schema["$defs"] = op_defs
            return _to_strict_schema(schema)
        return schema
json_schema
classmethod
json_schema(*, strict: bool = False) -> dict[str, Any]
The by-id mirror of VideoEdit.json_schema, reusing the op union and strict rewrite.
Source code in src/videopython/ai/auto_edit/models.py
@classmethod
def json_schema(cls, *, strict: bool = False) -> dict[str, Any]:
    """Return the JSON schema of a by-id plan, mirroring VideoEdit.json_schema.

    The operation union comes from ``Operation.json_schema()`` and is used for
    both per-segment ``operations`` and plan-level ``post_operations``. When
    ``strict`` is true, the operation ``$defs`` are lifted to the schema root
    and the schema is passed through the strict rewrite before being returned.
    """
    op_schema = Operation.json_schema()

    scene_id_schema = field_schema(PlanSegment, "scene_id")
    operations_schema = array_field_schema(PlanSegment, "operations", op_schema)
    transition_schema = optional_model_field_schema(TransitionSpec, PlanSegment, "transition_in")
    segment_schema: dict[str, Any] = {
        "type": "object",
        "description": PlanSegment.__doc__,
        "properties": {
            "scene_id": scene_id_schema,
            "operations": operations_schema,
            "transition_in": transition_schema,
        },
        "required": ["scene_id"],
        "additionalProperties": False,
    }

    segments = field_schema(cls, "segments")
    segments["items"] = segment_schema
    post_operations_schema = array_field_schema(cls, "post_operations", op_schema)
    schema: dict[str, Any] = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "description": cls.__doc__,
        "properties": {
            "segments": segments,
            "post_operations": post_operations_schema,
        },
        "required": ["segments"],
        "additionalProperties": False,
    }

    if strict:
        # Done last, once op_schema has been handed to the array helpers above.
        op_defs = op_schema.pop("$defs", None)
        if op_defs:
            schema["$defs"] = op_defs
        return _to_strict_schema(schema)
    return schema
EditCatalog
Bases: BaseModel
The candidate scenes across all source videos.
Source code in src/videopython/ai/auto_edit/models.py
class EditCatalog(BaseModel):
    """The candidate scenes across all source videos."""

    scenes: list[CatalogScene]

    def by_id(self) -> dict[str, CatalogScene]:
        """Index the scenes by id; if an id repeats, the later scene wins."""
        index: dict[str, CatalogScene] = {}
        for scene in self.scenes:
            index[scene.id] = scene
        return index
CatalogScene
Bases: BaseModel
A candidate scene the planner picks by id; carries the exact source bounds.
Source code in src/videopython/ai/auto_edit/models.py
class CatalogScene(BaseModel):
    """A candidate scene the planner picks by id; carries the exact source bounds."""

    # Identifier that plan segments reference through ``scene_id``.
    id: str
    # Source video the scene belongs to.
    source: Path
    # Scene bounds; build_catalog fills them from the sample's start_second / end_second.
    start: float
    end: float
    # build_catalog stores max(0.0, end - start) here.
    duration: float
    shot_type: str | None = None
    caption: str = ""
    # Transcript excerpt for the scene; empty when there is none.
    transcript: str = ""
    # build_catalog sets this to bool(transcript).
    has_speech: bool = False
    # build_catalog sets this to bool(sample.faces).
    has_faces: bool = False
Errors
AutoEditError
Bases: AiError, RuntimeError
The planner could not produce a valid edit within the retry budget.
Source code in src/videopython/ai/auto_edit/editor.py
class AutoEditError(AiError, RuntimeError):
    """The planner could not produce a valid edit within the retry budget.

    Raised by ``AutoEditor.edit_from_analyses`` once ``max_rounds`` rounds have
    been used up; the message carries the feedback from the last round.
    """
PlannerError
Bases: AiError, RuntimeError
A backend produced unusable output; the editor retries (infra errors should propagate instead).
Source code in src/videopython/ai/auto_edit/backend.py
class PlannerError(AiError, RuntimeError):
    """A backend produced unusable output; the editor retries (infra errors should propagate instead).

    ``AutoEditor.edit_from_analyses`` catches this error, turns it into prompt
    feedback and spends another planning round.
    """
UnknownSceneIdsError
Bases: AiError, ValueError
An EditPlan referenced scene ids absent from the catalog.
Source code in src/videopython/ai/auto_edit/resolve.py
class UnknownSceneIdsError(AiError, ValueError):
    """Raised when an EditPlan names scene ids that the catalog does not contain.

    ``ids`` holds the offending ids exactly as they were passed in; the error
    message lists them de-duplicated and sorted.
    """

    def __init__(self, ids: list[str]) -> None:
        self.ids = ids
        distinct_ids = sorted(set(ids))
        message = f"Plan references unknown scene ids: {distinct_ids}"
        super().__init__(message)