Protobuf Schema

The protobuf schema is defined in two files, as described below.

  • schema.proto contains the Frame definition, which is used by perception to send metadata

  • ext.proto contains the Behavior definition, which is used by behavior analytics

schema.proto

syntax = "proto3";

package nv;

import "google/protobuf/timestamp.proto";

/*
represents metadata in a video frame
*/
message Frame {
  // Version.
  string version = 1;

  // Frame id.
  string id = 2;

  // Timestamp, in UTC.
  google.protobuf.Timestamp timestamp = 3;

  // Sensor id.
  string sensorId = 4;

  // Sequence of detected objects.
  repeated Object objects = 5;
}

/*
represents a detected object in a given frame. Most of the attributes in an Object are optional
and are dictated by the inference model pipeline. Object id, bbox, type and confidence are must-haves; the rest are optional
*/
message Object {
  // Object ID.
  string id = 1;

  // Detection bounding box.
  Bbox bbox = 2;

  // Type of object, for example Person, Vehicle, Face.
  string type = 3;

  // Detection confidence.
  float confidence = 4;

  // Map of attributes as name:value pairs. For example, secondary
  // attributes for a vehicle would be color, make, etc.; for a detected
  // person they would be height, eye-color, etc.
  map<string, string> info = 5;

  // Object appearance vector or embedding.
  Embedding embedding = 6;

  // Object pose; primarily provides an array of keypoints.
  Pose pose = 7;

  // Gaze: direction and angle of gaze.
  Gaze gaze = 8;

  // Lip activity, like speaking or silent.
  LipActivity lipActivity = 9;

  // Speed of the object.
  float speed = 10;

  // Direction, as x, y, z.
  repeated float dir = 11;

  // Cartesian coordinate.
  Coordinate coordinate = 12;

  // Location as lat, lon, alt.
  Location location = 13;
}

/*
cartesian coordinates
*/
message Coordinate {
  // x, y and z components of the cartesian coordinate.
  double x = 1;
  double y = 2;
  double z = 3;
}

/*
geo location
*/
message Location {
  // Latitude.
  double lat = 1;

  // Longitude.
  double lon = 2;

  // Altitude.
  double alt = 3;
}
/*
top left corner + bottom right corner
*/
message Bbox {
  // Top left corner: (leftX, topY).
  float leftX = 1;
  float topY = 2;

  // Bottom right corner: (rightX, bottomY).
  float rightX = 3;
  float bottomY = 4;
}

/*
The type represents the nature of the pose,
i.e., pose2D, pose25D or pose3D,

which is followed by several Keypoints or body parts.
Each Keypoint comprises

    -name,

    -coordinate (x,y,z, confidence)

    -quaternion (qx,qy,qz,qw), quaternion values are optional.

    - e.g., |left-eye|x,y,z,0.75|qx,qy,qz,qw|.

-For pose3D, the coordinates (x,y,z) are in the world coordinate system with
respect to the camera (unit: mm).

-For pose2.5D, it shares the same format as pose3D,
however, the coordinates (x,y) are in the image plane (unit: pixel) and the z coordinate
stands for the metric depth relative to the root keypoint, i.e., pelvis (unit: mm).

-For pose2D, (x,y) are the image pixel coordinates and the z coordinate is ignored, e.g., |right-ear|x,y,0.0,0.80|.

*/
message Pose {
  // pose2D or pose3D or pose25D.
  string type = 1;

  // Array of keypoints.
  repeated Keypoint keypoints = 2;

  // Array of pose actions with their probabilities or confidence.
  repeated Action actions = 3;

  // Keypoint + quaternion.
  message Keypoint {
    // Name of the keypoint.
    string name = 1;

    // Array of (x, y, z, confidence).
    repeated float coordinates = 2;

    // Array of (qx, qy, qz, qw).
    repeated float quaternion = 3;
  }

  // Pose action, like walking or running.
  message Action {
    // Type of action: standing, running, etc.
    string type = 1;

    // Confidence of the action.
    float confidence = 2;
  }
}

/*
Gaze point of reference  x,y,z are in the camera coordinate system.
theta, phi are angles
*/
message Gaze {
  // Point of reference, in the camera coordinate system.
  float x = 1;
  float y = 2;
  float z = 3;

  // Gaze angles.
  float theta = 4;
  float phi = 5;
}

/*
lip activity, like silent or speaking
*/
message LipActivity {
  // Class label of the lip activity, like speaking or silent.
  string classLabel = 1;
}

/*
event like moving, parked, etc
*/
message Event {
  // Event id.
  string id = 1;

  // Event type.
  string type = 2;

  // Optional attributes.
  // Note: field numbers 3 and 4 are not used by this message.
  map<string, string> info = 5;
}

/*
*
* Analytics Module to be nested in every message, example below
*
* {{{
*
*
* "analyticsModule": {
*                   "id": "module-id",
*                   "description": "Vehicle Detection and License Plate Recognition",
*                   "source": "OpenALR",
*                   "version": "3.0"
* }
*
*
* }}}
*
*/

message AnalyticsModule {
  // Module id.
  string id = 1;

  // Description of the module.
  string description = 2;

  // Source of the module.
  string source = 3;

  // Version of the module.
  string version = 4;

  // Additional info.
  map<string, string> info = 5;
}


/*
*
* Sensor object to be nested in every message, example
*
* {{{
*
*
* "sensor": {
*           "id": "string",
*           "type": "Camera/Puck",
*           "location": {
*                     "lat": 45.99,
*                     "lon": 35.54,
*                     "alt": 79.03
*       },
*           "coordinate": {
*                      "x": 5.2,
*                     "y": 10.1,
*                     "z": 11.2
*      },
*      "description": "Entrance of Endeavor Garage Right Lane"
* }
*
*
* }}}
*
*/
message Sensor {
  // Sensor id.
  string id = 1;

  // Camera or Puck.
  string type = 2;

  // Description of the sensor.
  string description = 3;

  // Location, in lat, lon, alt.
  Location location = 4;

  // Coordinate, in x, y, z.
  Coordinate coordinate = 5;

  // Additional info.
  map<string, string> info = 6;
}


/*
*
* Describing a scene needs describing where the scene is happening – place,
* what is happening in terms of events, and who are the objects participating in the event.
* note coordinate(x,y,z) are in meters
*
* JSON representation of Place
* {{{
*
*
        "place": {
            "id": "string",
            "name": "endeavor",
            "type": "building/garage",
            "location": {
                "lat": 37.37060687475246,
                "lon": -121.9672466762127,
                "alt": 0.00
            }
        }
*
*
* }}}
*
*
*
*/
message Place {
  // Place id.
  string id = 1;

  // Name of the place.
  string name = 2;

  // Parking Lot or Entrance or Room.
  string type = 3;

  // Location, in lat, lon, alt.
  Location location = 4;

  // Coordinate, in x, y, z.
  Coordinate coordinate = 5;

  // Additional info, for example details of a parking lot.
  map<string, string> info = 6;
}

/*
represents a single object detection
*/
message Message {
  // Message id.
  string messageid = 1;

  // NOTE(review): presumably the version of the metadata schema - confirm.
  string mdsversion = 2;

  // Timestamp of the message.
  google.protobuf.Timestamp timestamp = 3;

  // Place, sensor and analytics module nested in the message.
  Place place = 4;
  Sensor sensor = 5;
  AnalyticsModule analyticsModule = 6;

  // The detected object.
  Object object = 7;

  // Event for this message.
  Event event = 8;

  // Video path.
  string videoPath = 9;
}

/*
Object Embedding
*/
message Embedding {
  // Object appearance vector.
  // The explicit packed option matches the proto3 default for repeated
  // scalar numeric fields and is kept as declared.
  repeated float vector = 1 [packed = true];

  // Name:value map of additional info.
  map<string, string> info = 2;
}

An example JSON frame object that follows the above protobuf schema is shown below

    {
        "version": "4.0",
        "timestamp": "2023-10-30T22:30:30.000Z",
        "id": "1",
        "sensorId": "Sensor_1",
        "objects": [
            {
                "dir": [],
                "lipActivity": null,
                "info": {
                    "visibility": "0.974372",
                    "footLocation2D": "899.152,316.812",
                    "footLocation3D": "9.85312,-12.53092",
                    "convexHull": "10,-4,23,-19,7,20,-18,26"
                },
                "coordinate": null,
                "speed": 0,
                "bbox": {
                    "topY": 107.74882507324219,
                    "rightX": 1116.9134521484375,
                    "bottomY": 311.5032653808594,
                    "leftX": 1063.572021484375
                },
                "embedding": {
                    "info": {},
                    "vector": [
                        -0.003528113476932049,
                        -0.002565900795161724,
                        -0.020954856649041176,
                        0.023293567821383476
                    ]
                },
                "location": null,
                "gaze": null,
                "id": "1",
                "confidence": 0.9556514024734497,
                "pose": null,
                "type": "Person"
            },
            {
                "dir": [],
                "lipActivity": null,
                "info": {
                    "visibility": "0.991698",
                    "footLocation2D": "297.655,179.069",
                    "footLocation3D": "17.56687,20.29478",
                    "convexHull": "-19,-33,-18,-32,3,35"
                },
                "coordinate": null,
                "speed": 0,
                "bbox": {
                    "topY": 115.80743408203125,
                    "rightX": 861.8057861328125,
                    "bottomY": 328.8199768066406,
                    "leftX": 814.3302001953125
                },
                "embedding": {
                    "info": {},
                    "vector": [
                        0.0002149203501176089,
                        -0.023472599685192108,
                        -0.02617083303630352,
                        0.022965412586927414
                    ]
                },
                "location": null,
                "gaze": null,
                "id": "2",
                "confidence": 0.9136765003204346,
                "pose": null,
                "type": "Person"
            }
        ]
    }

ext.proto

syntax = "proto3";

package nv;

option java_package = "nv.schema";

import "google/protobuf/timestamp.proto";
import "schema.proto";



/*
represents an enhanced frame
*/
message FrameMessage {
  // Version.
  string version = 1;

  // Frame id.
  string id = 2;

  // Timestamp, in UTC.
  google.protobuf.Timestamp timestamp = 3;

  // Sensor id.
  string sensorId = 4;

  // Sequence of detected objects.
  repeated Object objects = 5;

  // FOV based object count.
  repeated TypeCount fov = 6;

  // ROI based object count.
  repeated TypeCount rois = 7;

  // Social distancing.
  SD socialDistancing = 8;

  // Additional information that may be needed by an application.
  // Note: field numbers 9 to 15 are not used by this message.
  map<string, string> info = 16;
}

/*
Object Type and corresponding count
*/
message TypeCount {
  // ROI or FOV id.
  string id = 1;

  // Type of object.
  string type = 2;

  // Count of objects.
  int32 count = 3;
}

/*
2D point, represented using x,y coordinates
*/
message Point2D {
  // x and y coordinates of the point.
  double x = 1;
  double y = 2;
}

/*
Cluster, array of 2D points
*/
message Cluster {
  // The 2D points that make up the cluster.
  repeated Point2D points = 1;
}

/*
social distancing
*/
message SD {
  // Cluster threshold.
  double threshold = 1;

  // Number of clusters violating social distancing.
  int32 proximityDetections = 2;

  // 2D point clusters.
  repeated Cluster clusters = 3;
}


/*
* represents a locations array: for geo coordinates Array[Array[lon, lat]], for cartesian coordinates Array[Array[x,y]]
*/
message GeoLocation {
  // Type of the locations array, e.g. "linestring".
  string type = 1;

  // A single point: Array(x, y).
  message Point {
    repeated double point = 1;
  }

  // Array of points.
  repeated Point coordinates = 2;

  // Alternative declaration, left disabled:
  //repeated google.protobuf.ListValue coordinates = 2;
}
/*
* represents object Behavior, and comprises attributes from object movement + [[Object]] appearance
* with highest confidence + [[Sensor]] + [[Place]]
* + [[Pose]] + [[LipActivity]] + [[Gaze]]
*
*
*
*
* the JSON representation of this object Behavior message  is stored in persistent store like Elasticsearch for search and indexing
*
*
*/
message Behavior {
  // Field number 4 is not assigned to any active field; it appears only in
  // the commented-out declaration below. It is reserved so that it cannot
  // be reused by accident with a different meaning.
  reserved 4;

  // Object id, usually a combination of sensorId + objectId.
  string id = 1;

  // Start timestamp.
  google.protobuf.Timestamp timestamp = 2;

  // End timestamp.
  google.protobuf.Timestamp end = 3;

  // Disabled declaration (start timestamp), kept for reference:
  //google.protobuf.Timestamp timestamp = 4;

  // Array of [lon,lat] or [x,y].
  GeoLocation locations = 5;

  // Smoothened array of [lon,lat] or [x,y].
  GeoLocation smoothLocations = 6;

  // Road network edges, only applicable for geo coordinates.
  repeated string edges = 7;

  // Distance traveled, in meters.
  double distance = 8;

  // Average speed, in mph.
  double speed = 9;

  // Speed over a period of the trajectory.
  repeated double speedOverTime = 10;

  // Time interval: end time - start time.
  double timeInterval = 11;

  // Bearing angle.
  double bearing = 12;

  // N, S, E or W based on bearing.
  string direction = 13;

  // Number of raw coordinates.
  int32 length = 14;

  // Where the object was seen by the sensor.
  Place place = 15;

  // Sensor details.
  Sensor sensor = 16;

  // AnalyticsModule.
  AnalyticsModule analyticsModule = 17;

  // Object details, associated with this trajectory.
  Object object = 18;

  // Event details.
  Event event = 19;

  // Video url, if available.
  string videoPath = 20;

  // Array of Pose.
  repeated Pose poses = 21;

  // Array of LipActivity.
  repeated LipActivity lipActivities = 22;

  // Array of Gaze.
  repeated Gaze gazes = 23;

  // Array of Embedding.
  repeated Embedding embeddings = 24;

  // Additional information that may be needed by an application.
  map<string, string> info = 25;
}

An example JSON behavior object that follows the above protobuf schema is shown below

    {
        "analyticsModule": {
            "description": "",
            "info": {},
            "version": "",
            "id": "",
            "source": ""
        },
        "end": "2023-11-11T09:50:31.252Z",
        "object": {
            "bbox": {
                "bottomY": 144,
                "leftX": 1625,
                "topY": 72,
                "rightX": 1664
            },
            "location": {
                "lat": -0.0002441081085205078,
                "lon": -0.00026202797699034064,
                "alt": 0
            },
            "info": {
                "visibility": "0.991698",
                "footLocation2D": "297.655,179.069",
                "footLocation3D": "17.56687,20.29478",
                "convexHull": "-19,-33,-18,-32,3,35"
            },
            "coordinate": {
                "z": 0,
                "y": 27.123123168945312,
                "x": 29.114219665527344
            },
            "id": "1052",
            "confidence": 0.9485750198364258,
            "pose": null,
            "gaze": null,
            "type": "Person",
            "lipActivity": null,
            "speed": 0,
            "dir": [],
            "embedding": null
        },
        "timestamp": "2023-11-11T09:50:31.218Z",
        "id": "Sensor_1 #-# 1052",
        "edges": [],
        "timeInterval": 0.034,
        "bearing": 239.8590251375903,
        "poses": [],
        "place": {
            "location": null,
            "name": "building=Building_1",
            "type": "",
            "coordinate": null,
            "info": {},
            "id": ""
        },
        "locations": {
            "type": "linestring",
            "coordinates": [
                [
                    29.199949264526367,
                    27.270771026611328
                ],
                [
                    29.114219665527344,
                    27.123123168945312
                ]
            ]
        },
        "info": {},
        "distance": 0.1707321118533282,
        "videoPath": "frameId-10696",
        "direction": "Down",
        "gazes": [],
        "speed": 11.232845266702702,
        "event": null,
        "smoothLocations": {
            "type": "linestring",
            "coordinates": [
                [
                    29.199949264526367,
                    27.270771026611328
                ],
                [
                    29.114219665527344,
                    27.123123168945312
                ]
            ]
        },
        "length": 2,
        "sensor": {
            "description": "Sensor_1",
            "location": null,
            "type": "",
            "coordinate": null,
            "info": {},
            "id": "Sensor_1"
        },
        "lipActivities": [],
        "speedOverTime": [
            11.232845266702702
        ]
    }