nemo_curator.tasks.video

View as Markdown

Module Contents

Classes

Name — Description

Clip — Container for video clip data including metadata, frames, and processing results.
ClipStats — Statistics for video clips including filtering, transcoding, and captioning results.
Video — Container for video content including metadata, frames, and processing results.
VideoMetadata — Metadata for video content including dimensions, timing, and codec information.
VideoTask — Task for processing a single video.
_Window — Container for video window data including metadata, frames, and processing results.

API

class nemo_curator.tasks.video.Clip(
uuid: uuid.UUID,
source_video: str,
span: tuple[float, float],
buffer: bytes | None = None,
extracted_frames: dict[str, numpy.typing.NDArray[numpy.uint8]] = dict(),
decoded_motion_data: None = None,
motion_score_global_mean: float | None = None,
motion_score_per_patch_min_256: float | None = None,
aesthetic_score: float | None = None,
cosmos_embed1_frames: numpy.typing.NDArray[numpy.float32] | None = None,
cosmos_embed1_embedding: numpy.typing.NDArray[numpy.float32] | None = None,
windows: list[nemo_curator.tasks.video._Window] = list(),
egomotion: dict[str, bytes] = dict(),
cosmos_embed1_text_match: tuple[str, float] | None = None,
errors: dict[str, str] = dict()
)
Dataclass

Container for video clip data including metadata, frames, and processing results.

This class stores information about a video segment, including its source, timing, extracted frames, motion data, aesthetic scores, and generated captions.

aesthetic_score
float | None = None
buffer
bytes | None = None
cosmos_embed1_embedding
NDArray[float32] | None = None
cosmos_embed1_frames
NDArray[float32] | None = None
cosmos_embed1_text_match
tuple[str, float] | None = None
decoded_motion_data
None = None
duration
float

Calculate the duration of the clip.

egomotion
dict[str, bytes] = field(default_factory=dict)
errors
dict[str, str] = field(default_factory=dict)
extracted_frames
dict[str, NDArray[uint8]] = field(default_factory=dict)
motion_score_global_mean
float | None = None
motion_score_per_patch_min_256
float | None = None
source_video
str
span
tuple[float, float]
uuid
UUID
windows
list[_Window] = field(default_factory=list)
nemo_curator.tasks.video.Clip.extract_metadata() -> dict[str, typing.Any] | None

Extract metadata from the clip’s buffer.

Returns: dict[str, Any] | None

A dictionary containing the extracted metadata (width, height, framerate, etc.), or None.

Raises:

  • Exception: Any exception from extract_video_metadata is propagated.
nemo_curator.tasks.video.Clip.get_major_size() -> int

Calculate total memory size of the clip.

Returns: int

Total size in bytes.

class nemo_curator.tasks.video.ClipStats(
num_filtered_by_motion: int = 0,
num_filtered_by_aesthetic: int = 0,
num_passed: int = 0,
num_transcoded: int = 0,
num_with_embeddings: int = 0,
num_with_caption: int = 0,
num_with_webp: int = 0,
total_clip_duration: float = 0.0,
max_clip_duration: float = 0.0
)
Dataclass

Statistics for video clips including filtering, transcoding, and captioning results.

This class accumulates statistics about the number of clips processed through different stages of the video processing pipeline, including motion filtering, aesthetic filtering, and captioning.

max_clip_duration
float = 0.0
num_filtered_by_aesthetic
int = 0
num_filtered_by_motion
int = 0
num_passed
int = 0
num_transcoded
int = 0
num_with_caption
int = 0
num_with_embeddings
int = 0
num_with_webp
int = 0
total_clip_duration
float = 0.0

nemo_curator.tasks.video.ClipStats.combine(other: nemo_curator.tasks.video.ClipStats) -> None

Combine two ClipStats objects.

Parameters:

other
ClipStats

ClipStats object to combine with.

class nemo_curator.tasks.video.Video(
input_video: pathlib.Path,
source_bytes: bytes | None = None,
metadata: nemo_curator.tasks.video.VideoMetadata = VideoMetadata(),
frame_array: numpy.typing.NDArray[numpy.uint8] | None = None,
clips: list[nemo_curator.tasks.video.Clip] = list(),
filtered_clips: list[nemo_curator.tasks.video.Clip] = list(),
num_total_clips: int = 0,
num_clip_chunks: int = 0,
clip_chunk_index: int = 0,
clip_stats: nemo_curator.tasks.video.ClipStats = ClipStats(),
errors: dict[str, str] = dict()
)
Dataclass

Container for video content including metadata, frames, and processing results.

This class stores information about a video segment, including its source, timing, extracted frames, motion data, aesthetic scores, and generated captions.

clip_chunk_index
int = 0
clip_stats
ClipStats = field(default_factory=ClipStats)
clips
list[Clip] = field(default_factory=list)
errors
dict[str, str] = field(default_factory=dict)
filtered_clips
list[Clip] = field(default_factory=list)
fraction
float

Calculate the fraction of processed clips.

frame_array
NDArray[uint8] | None = None
input_path
str

Get the input path of the video.

input_video
Path
metadata
VideoMetadata = field(default_factory=VideoMetadata)
num_clip_chunks
int = 0
num_total_clips
int = 0
source_bytes
bytes | None = None
weight
float

Calculate the weight of the video.

nemo_curator.tasks.video.Video.get_major_size() -> int

Calculate total memory size of the video.

Returns: int

Total size in bytes.

nemo_curator.tasks.video.Video.has_metadata() -> bool

Check if all metadata fields are present.

Returns: bool

True if all metadata fields are present, False otherwise.

nemo_curator.tasks.video.Video.is_10_bit_color() -> bool | None

Heuristic function to determine if the input video has 10-bit color.

nemo_curator.tasks.video.Video.populate_metadata() -> None

Extract and assign video metadata from source_bytes.

This method extracts metadata from the video data in source_bytes and assigns it to self.metadata.

Raises:

  • ValueError: If source_bytes is None.
  • Exception: Any exception from extract_video_metadata is propagated.
class nemo_curator.tasks.video.VideoMetadata(
size: int | None = None,
height: int | None = None,
width: int | None = None,
framerate: float | None = None,
num_frames: int | None = None,
duration: float | None = None,
video_codec: str | None = None,
pixel_format: str | None = None,
audio_codec: str | None = None,
bit_rate_k: int | None = None
)
Dataclass

Metadata for video content including dimensions, timing, and codec information.

This class stores essential video properties such as resolution, frame rate, duration, and encoding details.

audio_codec
str | None = None
bit_rate_k
int | None = None
duration
float | None = None
framerate
float | None = None
height
int | None = None
num_frames
int | None = None
pixel_format
str | None = None
size
int | None = None
video_codec
str | None = None
width
int | None = None
class nemo_curator.tasks.video.VideoTask(
task_id: str,
dataset_name: str,
data: nemo_curator.tasks.video.Video = Video(),
_stage_perf: list[nemo_curator.utils.performance_utils.StagePerfStats] = list(),
_metadata: dict[str, typing.Any] = dict()
)
Dataclass

Bases: Task[Video]

Task for processing a single video.

data
Video = field(default_factory=Video)
num_items
int

Get the number of items in this task.

nemo_curator.tasks.video.VideoTask.validate() -> bool

Validate the task data.

class nemo_curator.tasks.video._Window(
start_frame: int,
end_frame: int,
mp4_bytes: bytes | None = None,
qwen_llm_input: dict[str, typing.Any] | None = None,
x1_input: typing.Any | None = None,
caption: dict[str, str] = dict(),
enhanced_caption: dict[str, str] = dict(),
webp_bytes: bytes | None = None
)
Dataclass

Container for video window data including metadata, frames, and processing results.

This class stores information about a video window, including its source, timing, extracted frames, motion data, aesthetic scores, and generated captions.

caption
dict[str, str] = field(default_factory=dict)
end_frame
int
enhanced_caption
dict[str, str] = field(default_factory=dict)
mp4_bytes
bytes | None = None
qwen_llm_input
dict[str, Any] | None = None
start_frame
int
webp_bytes
bytes | None = None
x1_input
Any | None = None
nemo_curator.tasks.video._Window.get_major_size() -> int

Calculate total memory size of the window.

Returns: int

Total size in bytes.