nemo_curator.tasks.video

View as Markdown

Module Contents

Classes

Name — Description

Clip — Container for video clip data including metadata, frames, and processing results.
ClipStats — Statistics for video clips including filtering, transcoding, and captioning results.
Video — Container for video content including metadata, frames, and processing results.
VideoMetadata — Metadata for video content including dimensions, timing, and codec information.
VideoTask — Task for processing a single video.
_Window — Container for video window data including metadata, frames, and processing results.

API

class nemo_curator.tasks.video.Clip(
uuid: uuid.UUID,
source_video: str,
span: tuple[float, float],
buffer: bytes | None = None,
extracted_frames: dict[str, numpy.typing.NDArray[numpy.uint8]] = dict(),
decoded_motion_data: None = None,
motion_score_global_mean: float | None = None,
motion_score_per_patch_min_256: float | None = None,
aesthetic_score: float | None = None,
cosmos_embed1_frames: numpy.typing.NDArray[numpy.float32] | None = None,
cosmos_embed1_embedding: numpy.typing.NDArray[numpy.float32] | None = None,
windows: list[nemo_curator.tasks.video._Window] = list(),
egomotion: dict[str, bytes] = dict(),
cosmos_embed1_text_match: tuple[str, float] | None = None,
errors: dict[str, str] = dict()
)
Dataclass

Container for video clip data including metadata, frames, and processing results.

This class stores information about a video segment, including its source, timing, extracted frames, motion data, aesthetic scores, and generated captions.

aesthetic_score
float | None = None
buffer
bytes | None = None
cosmos_embed1_embedding
NDArray[float32] | None = None
cosmos_embed1_frames
NDArray[float32] | None = None
cosmos_embed1_text_match
tuple[str, float] | None = None
decoded_motion_data
None = None
duration
float

Calculate the duration of the clip.

egomotion
dict[str, bytes] = field(default_factory=dict)
errors
dict[str, str] = field(default_factory=dict)
extracted_frames
dict[str, NDArray[uint8]] = field(default_factory=dict)
motion_score_global_mean
float | None = None
motion_score_per_patch_min_256
float | None = None
source_video
str
span
tuple[float, float]
uuid
UUID
windows
list[_Window] = field(default_factory=list)
nemo_curator.tasks.video.Clip.extract_metadata() -> dict[str, typing.Any] | None

Extract metadata from the clip’s buffer.

Returns: dict[str, Any] | None

A dictionary containing the extracted metadata (width, height, framerate, etc.), or None.

Raises:

  • Exception: Any exception from extract_video_metadata is propagated.
nemo_curator.tasks.video.Clip.get_major_size() -> int

Calculate total memory size of the clip.

Returns: int

Total size in bytes.

class nemo_curator.tasks.video.ClipStats(
num_filtered_by_motion: int = 0,
num_filtered_by_aesthetic: int = 0,
num_passed: int = 0,
num_transcoded: int = 0,
num_with_embeddings: int = 0,
num_with_caption: int = 0,
num_with_webp: int = 0,
total_clip_duration: float = 0.0,
max_clip_duration: float = 0.0
)
Dataclass

Statistics for video clips including filtering, transcoding, and captioning results.

This class accumulates statistics about the number of clips processed through different stages of the video processing pipeline, including motion filtering, aesthetic filtering, and captioning.

max_clip_duration
float = 0.0
num_filtered_by_aesthetic
int = 0
num_filtered_by_motion
int = 0
num_passed
int = 0
num_transcoded
int = 0
num_with_caption
int = 0
num_with_embeddings
int = 0
num_with_webp
int = 0
total_clip_duration
float = 0.0

nemo_curator.tasks.video.ClipStats.combine(other: nemo_curator.tasks.video.ClipStats) -> None

Combine two ClipStats objects.

Parameters:

other
ClipStats

ClipStats object to combine with.

class nemo_curator.tasks.video.Video(
input_video: pathlib.Path,
source_bytes: bytes | None = None,
metadata: nemo_curator.tasks.video.VideoMetadata = VideoMetadata(),
frame_array: numpy.typing.NDArray[numpy.uint8] | None = None,
clips: list[nemo_curator.tasks.video.Clip] = list(),
filtered_clips: list[nemo_curator.tasks.video.Clip] = list(),
num_total_clips: int = 0,
num_clip_chunks: int = 0,
clip_chunk_index: int = 0,
clip_stats: nemo_curator.tasks.video.ClipStats = ClipStats(),
errors: dict[str, str] = dict()
)
Dataclass

Container for video content including metadata, frames, and processing results.

This class stores information about a video segment, including its source, timing, extracted frames, motion data, aesthetic scores, and generated captions.

clip_chunk_index
int = 0
clip_stats
ClipStats = field(default_factory=ClipStats)
clips
list[Clip] = field(default_factory=list)
errors
dict[str, str] = field(default_factory=dict)
filtered_clips
list[Clip] = field(default_factory=list)
fraction
float

Calculate the fraction of processed clips.

frame_array
NDArray[uint8] | None = None
input_path
str

Get the input path of the video.

input_video
Path
metadata
VideoMetadata = field(default_factory=VideoMetadata)
num_clip_chunks
int = 0
num_total_clips
int = 0
source_bytes
bytes | None = None
weight
float

Calculate the weight of the video.

nemo_curator.tasks.video.Video.get_major_size() -> int

Calculate total memory size of the video.

Returns: int

Total size in bytes.

nemo_curator.tasks.video.Video.has_metadata() -> bool

Check if all metadata fields are present.

Returns: bool

True if all metadata fields are present, False otherwise.

nemo_curator.tasks.video.Video.is_10_bit_color() -> bool | None

Heuristic function to determine if the input video has 10-bit color.

nemo_curator.tasks.video.Video.populate_metadata() -> None

Extract and assign video metadata from source_bytes.

This method extracts metadata from the video data in source_bytes and assigns it to self.metadata.

Raises:

  • ValueError: If source_bytes is None.
  • Exception: Any exception from extract_video_metadata is propagated.
class nemo_curator.tasks.video.VideoMetadata(
size: int | None = None,
height: int | None = None,
width: int | None = None,
framerate: float | None = None,
num_frames: int | None = None,
duration: float | None = None,
video_codec: str | None = None,
pixel_format: str | None = None,
audio_codec: str | None = None,
bit_rate_k: int | None = None
)
Dataclass

Metadata for video content including dimensions, timing, and codec information.

This class stores essential video properties such as resolution, frame rate, duration, and encoding details.

audio_codec
str | None = None
bit_rate_k
int | None = None
duration
float | None = None
framerate
float | None = None
height
int | None = None
num_frames
int | None = None
pixel_format
str | None = None
size
int | None = None
video_codec
str | None = None
width
int | None = None
class nemo_curator.tasks.video.VideoTask(
task_id: str,
dataset_name: str,
data: nemo_curator.tasks.video.Video = Video(),
_stage_perf: list[nemo_curator.utils.performance_utils.StagePerfStats] = list(),
_metadata: dict[str, typing.Any] = dict()
)
Dataclass

Bases: Task[Video]

Task for processing a single video.

data
Video = field(default_factory=Video)
num_items
int

Get the number of items in this task.

nemo_curator.tasks.video.VideoTask.validate() -> bool

Validate the task data.

class nemo_curator.tasks.video._Window(
start_frame: int,
end_frame: int,
mp4_bytes: bytes | None = None,
qwen_llm_input: dict[str, typing.Any] | None = None,
x1_input: typing.Any | None = None,
caption: dict[str, str] = dict(),
enhanced_caption: dict[str, str] = dict(),
webp_bytes: bytes | None = None
)
Dataclass

Container for video window data including metadata, frames, and processing results.

This class stores information about a video window, including its source, timing, extracted frames, motion data, aesthetic scores, and generated captions.

caption
dict[str, str] = field(default_factory=dict)
end_frame
int
enhanced_caption
dict[str, str] = field(default_factory=dict)
mp4_bytes
bytes | None = None
qwen_llm_input
dict[str, Any] | None = None
start_frame
int
webp_bytes
bytes | None = None
x1_input
Any | None = None
nemo_curator.tasks.video._Window.get_major_size() -> int

Calculate total memory size of the window.

Returns: int

Total size in bytes.