Protobuf Schema

The protobuf schemas are defined in two files, as described below.

  • schema.proto contains the Frame definition, which is used by perception to send metadata

  • ext.proto contains the Behavior definition, which is used by behavior analytics


syntax = "proto3";

package nv;

import "google/protobuf/timestamp.proto";

// Represents the metadata in a video frame.
message Frame {
    // version string (the JSON example in this document uses "4.0")
    string version = 1;

    // frame id
    string id = 2;

    // timestamp in UTC format
    google.protobuf.Timestamp timestamp = 3;

    // sensor id
    string sensorId = 4;

    // sequence of detected objects
    repeated Object objects = 5;
}

// Represents a detected object in a given frame. Most of the attributes in an
// Object are optional and are dictated by the inference model pipeline.
// Object id, bbox, type and confidence are must-have; the rest is optional.
message Object {
    // object ID
    string id = 1;

    // detection bbox
    Bbox bbox = 2;

    // type of object, example Person, Vehicle, Face
    string type = 3;

    // detection confidence
    float confidence = 4;

    // map of attributes, name:value pairs. Example: secondary attributes for a
    // vehicle would be color, make etc.; when a person is detected it will be
    // height, eye-color etc.
    map<string, string> info = 5;

    // object appearance vector or embedding
    Embedding embedding = 6;

    // object pose, primarily provides array of keypoints
    Pose pose = 7;

    // gaze, direction and angle of gaze
    Gaze gaze = 8;

    // lip activity like speaking, silent
    LipActivity lipActivity = 9;

    // speed of object
    float speed = 10;

    // direction, in x, y, z
    repeated float dir = 11;

    // cartesian coordinate
    Coordinate coordinate = 12;

    // lat, lon, alt
    Location location = 13;
}


// Cartesian coordinates.
message Coordinate {
    // position along the x axis
    double x = 1;
    // position along the y axis
    double y = 2;
    // position along the z axis
    double z = 3;
}


// Geo location.
message Location {
    // latitude
    double lat = 1;
    // longitude
    double lon = 2;
    // altitude
    double alt = 3;
}
// Bounding box: top left corner + bottom right corner.
message Bbox {
    // x of the top left corner
    float leftX = 1;
    // y of the top left corner
    float topY = 2;
    // x of the bottom right corner
    float rightX = 3;
    // y of the bottom right corner
    float bottomY = 4;
}

// The type represents the nature of the pose,
// i.e., pose2D, pose25D or pose3D,
// which is followed by several Keypoints or bodyparts.
// Each Keypoint comprises of
//
//     - coordinate (x,y,z, confidence)
//
//     - quaternion (qx,qy,qz,qw), quaternion values are optional.
//
//     - e.g., |left-eye|x,y,z,0.75|qx,qy,qz,qw|.
//
// - For pose3D, the coordinates (x,y,z) are in the world coordinate system with
//   respect to the camera (unit: mm).
//
// - For pose2.5D, it shares the same format as pose3D,
//   however, the coordinates (x,y) are in the image plane (unit: pixel) and the
//   z coordinate stands for the metric depth relative to the root keypoint,
//   i.e., pelvis (unit: mm).
//
// - For pose2D, (x,y) are the image pixel coordinates and the z coordinate is
//   ignored, e.g., |right-ear,x,y,|.
message Pose {
    // pose2D or pose3D or pose25D
    string type = 1;

    // array of keypoints
    repeated Keypoint keypoints = 2;

    // array of pose actions with their probabilities or confidence
    repeated Action actions = 3;

    // Keypoint + quaternion
    message Keypoint {
        // name of keypoint
        string name = 1;

        // Array comprising of (x,y,z, confidence)
        repeated float coordinates = 2;

        // Array comprising of (qx,qy,qz,qw)
        repeated float quaternion = 3;
    }
}


// Pose Action, like walking, running.
message Action {
    // type of action, standing, running etc
    string type = 1;

    // confidence of the action
    float confidence = 2;
}

// Gaze point of reference: x, y, z are in the camera coordinate system;
// theta, phi are angles.
message Gaze {
    // gaze point of reference, x
    float x = 1;
    // gaze point of reference, y
    float y = 2;
    // gaze point of reference, z
    float z = 3;
    // angle theta
    float theta = 4;
    // angle phi
    float phi = 5;
}


// Lip activity, like silent or speaking.
message LipActivity {
    // label of the lip activity class
    string classLabel = 1;
}

// Event, like moving, parked, etc.
message Event {
    // event id
    string id = 1;
    // type of event
    string type = 2;

    // optional attributes
    map<string, string> info = 5;
}

// Analytics Module to be nested in every message, example below
// {{{
// "analyticsModule": {
//                   "id": "module-id",
//                   "description": "Vehicle Detection and License Plate Recognition",
//                   "source": "OpenALR",
//                   "version": "3.0"
// }
// }}}
message AnalyticsModule {
    // module id
    string id = 1;
    // description of the module
    string description = 2;
    // source of the module
    string source = 3;
    // version of the module
    string version = 4;
    // additional info
    map<string, string> info = 5;
}


// Sensor object to be nested in every message, example
// {{{
// "sensor": {
//           "id": "string",
//           "type": "Camera/Puck",
//           "location": {
//                     "lat": 45.99,
//                     "lon": 35.54,
//                     "alt": 79.03
//           },
//           "coordinate": {
//                     "x": 5.2,
//                     "y": 10.1,
//                     "z": 11.2
//           },
//           "description": "Entrance of Endeavor Garage Right Lane"
// }
// }}}
message Sensor {
    // sensor id
    string id = 1;
    // Camera or Puck
    string type = 2;
    // description of the sensor
    string description = 3;
    // in lat, lon, alt
    Location location = 4;
    // in x,y,z
    Coordinate coordinate = 5;
    // additional info
    map<string, string> info = 6;
}


// Describing a scene needs describing where the scene is happening – place,
// what is happening in terms of events, and who are the objects participating
// in the event.
// note coordinate(x,y,z) are in meters
// JSON representation of Place
// {{{
//        "place": {
//            "id": "string",
//            "name": "endeavor",
//            "type": "building/garage",
//            "location": {
//                "lat": 37.37060687475246,
//                "lon": -121.9672466762127,
//                "alt": 0.00
//            }
//        }
// }}}
message Place {
    // place id
    string id = 1;
    // name of the place
    string name = 2;
    // Parking Lot or Entrance or Room
    string type = 3;
    // in lat, lon, alt
    Location location = 4;
    // in x,y,z
    Coordinate coordinate = 5;
    // additional info, example details of park lot
    map<string, string> info = 6;
}


// Represents a single object detection.
message Message {
    // id of this message
    string messageid = 1;
    // version string carried with the message
    string mdsversion = 2;
    // timestamp of the message
    google.protobuf.Timestamp timestamp = 3;
    // place associated with the detection
    Place place = 4;
    // sensor associated with the detection
    Sensor sensor = 5;
    // analytics module associated with the detection
    AnalyticsModule analyticsModule = 6;
    // the detected object
    Object object = 7;
    // event associated with the object
    Event event = 8;
    // video path
    string videoPath = 9;
}

// Object Embedding.
message Embedding {
    // object appearance vector
    // (packed encoding is already the proto3 default for repeated numeric
    // fields; the explicit option is kept unchanged)
    repeated float vector = 1 [packed = true];

    // additional info
    map<string, string> info = 2;
}


An example of a JSON frame object that follows the above protobuf schema can be found below.

    {
        "version": "4.0",
        "timestamp": "2023-10-30T22:30:30.000Z",
        "id": "1",
        "sensorId": "Sensor_1",
        "objects": [
            {
                "dir": [],
                "lipActivity": null,
                "info": {
                    "visibility": "0.974372",
                    "footLocation2D": "899.152,316.812",
                    "footLocation3D": "9.85312,-12.53092",
                    "convexHull": "10,-4,23,-19,7,20,-18,26"
                },
                "coordinate": null,
                "speed": 0,
                "bbox": {
                    "topY": 107.74882507324219,
                    "rightX": 1116.9134521484375,
                    "bottomY": 311.5032653808594,
                    "leftX": 1063.572021484375
                },
                "embedding": {
                    "info": {},
                    "vector": []
                },
                "location": null,
                "gaze": null,
                "id": "1",
                "confidence": 0.9556514024734497,
                "pose": null,
                "type": "Person"
            },
            {
                "dir": [],
                "lipActivity": null,
                "info": {
                    "visibility": "0.991698",
                    "footLocation2D": "297.655,179.069",
                    "footLocation3D": "17.56687,20.29478",
                    "convexHull": "-19,-33,-18,-32,3,35"
                },
                "coordinate": null,
                "speed": 0,
                "bbox": {
                    "topY": 115.80743408203125,
                    "rightX": 861.8057861328125,
                    "bottomY": 328.8199768066406,
                    "leftX": 814.3302001953125
                },
                "embedding": {
                    "info": {},
                    "vector": []
                },
                "location": null,
                "gaze": null,
                "id": "2",
                "confidence": 0.9136765003204346,
                "pose": null,
                "type": "Person"
            }
        ]
    }


syntax = "proto3";

package nv;

option java_package = "nv.schema";

import "google/protobuf/timestamp.proto";
import "schema.proto";

// Represents an enhanced frame.
message FrameMessage {
    // version string
    string version = 1;

    // frame id
    string id = 2;

    // timestamp in UTC format
    google.protobuf.Timestamp timestamp = 3;

    // sensor id
    string sensorId = 4;

    // sequence of detected objects
    repeated Object objects = 5;

    // FOV based object count
    repeated TypeCount fov = 6;

    // ROI based object count
    repeated TypeCount rois = 7;

    // social distancing details
    SD socialDistancing = 8;

    // additional information that may be needed by an application
    map<string, string> info = 16;
}


// Object type and corresponding count.
message TypeCount {
    // roi or fov id
    string id = 1;
    // type of object
    string type = 2;
    // count of objects
    int32 count = 3;
}

// 2D point, represented using x,y coordinates.
message Point2D {
    // x coordinate
    double x = 1;
    // y coordinate
    double y = 2;
}

// Cluster, array of 2D points.
message Cluster {
    // points belonging to the cluster
    repeated Point2D points = 1;
}

// Social distancing.
message SD {
    // cluster threshold
    double threshold = 1;
    // number of clusters violating social distancing
    int32 proximityDetections = 2;
    // 2D point clusters
    repeated Cluster clusters = 3;
}

// Represents a locations array: for geo coordinates Array[Array[lon, lat]],
// for cartesian coordinates Array[Array[x,y]].
message GeoLocation {
    // type of the locations array (the JSON example in this document uses
    // "linestring")
    string type = 1;

    // A single point, stored as an array of doubles ([lon, lat] or [x, y]).
    message Point {
        repeated double point = 1;
    }

    // Array of points
    repeated Point coordinates = 2;

    //repeated google.protobuf.ListValue coordinates = 2;
}
// Represents object Behavior, and comprises attributes from object movement
// + [[Object]] appearance with highest confidence + [[Sensor]] + [[Place]]
// + [[Pose]] + [[LipActivity]] + [[Gaze]].
// The JSON representation of this object Behavior message is stored in a
// persistent store like Elasticsearch for search and indexing.
message Behavior {
    // object id, usually a combination of sensorId + objectId
    string id = 1;

    // start timestamp
    google.protobuf.Timestamp timestamp = 2;

    // end timestamp
    google.protobuf.Timestamp end = 3;

    // start timestamp

    //google.protobuf.Timestamp timestamp = 4;

    // representing array of [lon,lat] or [x,y]
    GeoLocation locations = 5;

    // representing smoothened array of [lon,lat] or [x,y]
    GeoLocation smoothLocations = 6;

    // road network edges, only applicable for geo coordinates
    repeated string edges = 7;

    // distance traveled in meters
    double distance = 8;

    // avg speed in mph
    double speed = 9;

    // speed over a period of the trajectory
    repeated double speedOverTime = 10;

    // timeInterval, end time - start time
    double timeInterval = 11;

    // bearing angle
    double bearing = 12;

    // N, S, E or W based on bearing
    string direction = 13;

    // number of raw coordinates
    int32 length = 14;

    // where object was seen by the sensor
    Place place = 15;

    // sensor details
    Sensor sensor = 16;

    // AnalyticsModule
    AnalyticsModule analyticsModule = 17;

    // object details, associated with this trajectory
    Object object = 18;

    // event details
    Event event = 19;

    // video url if available
    string videoPath = 20;

    // Array of pose
    repeated Pose poses = 21;

    // Array of LipActivity
    repeated LipActivity lipActivities = 22;

    // Array of Gaze
    repeated Gaze gazes = 23;

    // Array of Embedding
    repeated Embedding embeddings = 24;

    // additional information that may be needed by an application
    map<string, string> info = 25;
}


An example of a JSON behavior object that follows the above protobuf schema can be found below.

        "analyticsModule": {
            "description": "",
            "info": {},
            "version": "",
            "id": "",
            "source": ""
        "end": "2023-11-11T09:50:31.252Z",
        "object": {
            "bbox": {
                "bottomY": 144,
                "leftX": 1625,
                "topY": 72,
                "rightX": 1664
            "location": {
                "lat": -0.0002441081085205078,
                "lon": -0.00026202797699034064,
                "alt": 0
            "info": {
                "visibility": "0.991698",
                "footLocation2D": "297.655,179.069",
                "footLocation3D": "17.56687,20.29478",
                "convexHull": "-19,-33,-18,-32,3,35"
            "coordinate": {
                "z": 0,
                "y": 27.123123168945312,
                "x": 29.114219665527344
            "id": "1052",
            "confidence": 0.9485750198364258,
            "pose": null,
            "gaze": null,
            "type": "Person",
            "lipActivity": null,
            "speed": 0,
            "dir": [],
            "embedding": null
        "timestamp": "2023-11-11T09:50:31.218Z",
        "id": "Sensor_1 #-# 1052",
        "edges": [],
        "timeInterval": 0.034,
        "bearing": 239.8590251375903,
        "poses": [],
        "place": {
            "location": null,
            "name": "building=Building_1",
            "type": "",
            "coordinate": null,
            "info": {},
            "id": ""
        "locations": {
            "type": "linestring",
            "coordinates": [
        "info": {},
        "distance": 0.1707321118533282,
        "videoPath": "frameId-10696",
        "direction": "Down",
        "gazes": [],
        "speed": 11.232845266702702,
        "event": null,
        "smoothLocations": {
            "type": "linestring",
            "coordinates": [
        "length": 2,
        "sensor": {
            "description": "Sensor_1",
            "location": null,
            "type": "",
            "coordinate": null,
            "info": {},
            "id": "Sensor_1"
        "lipActivities": [],
        "speedOverTime": [