gRPC through A2F Controller
This page describes how to interact with the Audio2Face Controller, which acts as the front end of an A2F Cluster.
Service definition
The A2F Controller interface is a bi-directional streaming service.
nvidia_ace.services.a2f_controller.v1.proto
syntax = "proto3";
package nvidia_ace.services.a2f_controller.v1;
import "nvidia_ace.controller.v1.proto";
import "nvidia_ace.animation_id.v1.proto";
import "google/protobuf/empty.proto";
service A2FControllerService {
// Processes a single audio clip and returns animation data
// in a burst.
rpc ProcessAudioStream(stream nvidia_ace.controller.v1.AudioStream)
returns (stream nvidia_ace.controller.v1.AnimationDataStream) {}
}
//nvidia_ace.services.a2f_controller.v1
//v0.1.0
The ProcessAudioStream rpc is the only call you need to make to generate animation data from audio input.
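As a minimal sketch, the call can be driven from Python with grpcio once the protos on this page are compiled (for example with grpcio-tools). The module names, the endpoint address, and the chunk size below are illustrative assumptions and must be adapted to your generated code and deployment.
import grpc

# Placeholder imports: adjust to match the modules produced by your proto compilation.
import a2f_controller_service_pb2_grpc as service_grpc  # nvidia_ace.services.a2f_controller.v1
import a2f_controller_pb2 as controller_pb2             # nvidia_ace.controller.v1
import a2f_pb2                                          # nvidia_ace.a2f.v1
import audio_pb2                                        # nvidia_ace.audio.v1

def request_generator(pcm_bytes, chunk_size=4096):
    # 1. The header must be sent first: 16 kHz, mono, 16-bit PCM in this example.
    yield controller_pb2.AudioStream(
        audio_stream_header=controller_pb2.AudioStreamHeader(
            audio_header=audio_pb2.AudioHeader(
                audio_format=audio_pb2.AudioHeader.AUDIO_FORMAT_PCM,
                channel_count=1,
                samples_per_second=16000,
                bits_per_sample=16,
            )
        )
    )
    # 2. Then one or more audio buffers.
    for offset in range(0, len(pcm_bytes), chunk_size):
        yield controller_pb2.AudioStream(
            audio_with_emotion=a2f_pb2.AudioWithEmotion(
                audio_buffer=pcm_bytes[offset:offset + chunk_size]
            )
        )
    # 3. The end-of-audio marker must be sent last.
    yield controller_pb2.AudioStream(end_of_audio=controller_pb2.AudioStream.EndOfAudio())

def run(pcm_bytes, target="localhost:52000"):  # hypothetical address and port
    with grpc.insecure_channel(target) as channel:
        stub = service_grpc.A2FControllerServiceStub(channel)
        # Bi-directional streaming: send the requests and iterate over the responses.
        for message in stub.ProcessAudioStream(request_generator(pcm_bytes)):
            print(message.WhichOneof("stream_part"))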
Service protobuf objects
nvidia_ace.controller.v1.proto
syntax = "proto3";
package nvidia_ace.controller.v1;
import "nvidia_ace.a2f.v1.proto";
import "nvidia_ace.animation_data.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.status.v1.proto";
import "google/protobuf/any.proto";
message AudioStream {
// This is a marker for the end of an audio clip.
message EndOfAudio {}
oneof stream_part {
// The header must be sent as the first message.
AudioStreamHeader audio_stream_header = 1;
// At least one AudioWithEmotion message must be sent thereafter.
nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
// The EndOfAudio must be sent last.
EndOfAudio end_of_audio = 3;
}
}
// IMPORTANT NOTE: this is an AudioStreamHeader WITHOUT ID
// A similar AudioStreamHeader exists in nvidia_ace.a2f.v1.proto
// but that one does contain IDs.
message AudioStreamHeader {
// Metadata about the audio being sent to the service.
nvidia_ace.audio.v1.AudioHeader audio_header = 1;
// Parameters for updating the facial characteristics of an avatar.
// See the documentation for more information.
nvidia_ace.a2f.v1.FaceParameters face_params = 2;
// Parameters for emotion blending and processing
// before the emotions are used to generate blendshapes.
// See the documentation for more information.
nvidia_ace.a2f.v1.EmotionPostProcessingParameters emotion_post_processing_params = 3;
// Multipliers and offsets to apply to the generated blendshape values.
nvidia_ace.a2f.v1.BlendShapeParameters blendshape_params = 4;
}
enum EventType {
// This event type means that the A2F Microservice is done processing the audio.
// However, it does not mean that you have finished receiving all the audio data;
// you will receive a Status message once all the audio data has been received.
// Events are independent of that.
END_OF_A2F_AUDIO_PROCESSING = 0;
}
message Event {
// Type of the event.
EventType event_type = 1;
// Data attached to the event if any.
optional google.protobuf.Any metadata = 2;
}
// IMPORTANT NOTE: this is an AnimationDataStreamHeader WITHOUT ID
// A similar AnimationDataStreamHeader exists in nvidia_ace.animation_data.v1.proto
// but that one does contain IDs.
message AnimationDataStreamHeader {
// Metadata of the audio buffers. This defines the audio clip properties
// at the beginning of the streaming process.
optional nvidia_ace.audio.v1.AudioHeader audio_header = 1;
// Metadata containing the blendshape and joints names.
// This defines the names of the blendshapes and joints flowing through a stream.
optional nvidia_ace.animation_data.v1.SkelAnimationHeader
skel_animation_header = 2;
// Time codes indicate the relative progression of animation data, an audio
// clip, etc. The unit is seconds. In addition, we also need an absolute time
// reference shared across services. The start time is stored in time codes
// elapsed since the Unix time epoch. start_time_code_since_epoch = `Unix
// timestamp in seconds`. NTP should be good enough to synchronize clocks
// across nodes. From Wikipedia: NTP can usually maintain time to within tens
// of milliseconds over the public Internet, and can achieve better than one
// millisecond accuracy in local area networks under ideal conditions.
// Alternatively, there is PTP.
double start_time_code_since_epoch = 3;
// A generic metadata field to attach use case specific data (e.g. session id
// or user id?)
// map<string, string> metadata = 4;
// map<string, google.protobuf.Any> metadata = 4;
}
message AnimationDataStream {
// The header must be sent as the first message.
// One or more animation data messages must be sent thereafter.
// A status may be sent in between, and a final status must be sent last.
oneof stream_part {
// The header must be sent as the first message.
AnimationDataStreamHeader animation_data_stream_header = 1;
// Then one or more animation data messages must be sent.
nvidia_ace.animation_data.v1.AnimationData animation_data = 2;
// The event may be sent in between.
Event event = 3;
// A status may be sent in between, and a final status must be sent last.
nvidia_ace.status.v1.Status status = 4;
}
}
//nvidia_ace.controller.v1
//v0.1.0
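The response side of the call can be consumed by dispatching on the stream_part oneof of each AnimationDataStream message. The sketch below is illustrative; the generated module names are placeholders, and the error handling is intentionally minimal.
import a2f_controller_pb2 as controller_pb2  # placeholder: generated nvidia_ace.controller.v1
import status_pb2                            # placeholder: generated nvidia_ace.status.v1

def handle_responses(response_iterator):
    for message in response_iterator:
        part = message.WhichOneof("stream_part")
        if part == "animation_data_stream_header":
            header = message.animation_data_stream_header
            print("blendshape names:", list(header.skel_animation_header.blend_shapes))
        elif part == "animation_data":
            frames = message.animation_data.skel_animation.blend_shape_weights
            print("received", len(frames), "blendshape frames")
        elif part == "event":
            if message.event.event_type == controller_pb2.END_OF_A2F_AUDIO_PROCESSING:
                print("A2F is done processing the audio; more data may still arrive")
        elif part == "status":
            if message.status.code == status_pb2.Status.ERROR:
                raise RuntimeError(message.status.message)
            print("status:", message.status.message)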
Dependencies
nvidia_ace.a2f.v1.proto
syntax = "proto3";
package nvidia_ace.a2f.v1;
import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.emotion_with_timecode.v1.proto";
message AudioStream {
// The header must be sent as the first message.
// One or more audio with emotion messages must be sent thereafter.
// The end of audio occurs when the client closes the connection.
oneof stream_part {
AudioStreamHeader audio_stream_header = 1;
nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
}
}
// IMPORTANT NOTE: this is an AudioStreamHeader WITH ID
// A similar AudioStreamHeader exists in nvidia_ace.controller.v1.proto
// but that one does NOT contain IDs
message AudioStreamHeader {
// IDs of the current stream
nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;
nvidia_ace.audio.v1.AudioHeader audio_header = 2;
// Parameters for updating the facial characteristics of an avatar
// See the documentation for more information
FaceParameters face_params = 3;
// Parameters for emotion blending and processing
// before the emotions are used to generate blendshapes
// See the documentation for more information
EmotionPostProcessingParameters emotion_post_processing_params = 4;
// Multipliers and offsets to apply to the generated blendshape values
BlendShapeParameters blendshape_params = 5;
}
message FloatArray { repeated float values = 1; }
// For the parameters that can be set here, refer to the documentation
message FaceParameters {
map<string, float> float_params = 1;
map<string, int32> integer_params = 2;
map<string, FloatArray> float_array_params = 3;
}
// For the Blendshape names, refer to the documentation
message BlendShapeParameters {
map<string, float> bs_weight_multipliers = 1;
map<string, float> bs_weight_offsets = 2;
}
// For more information refer to the documentation
message EmotionPostProcessingParameters {
// Increases the spread between emotion values by pushing them higher or lower.
// Default value: 1
// Min: 0.3
// Max: 3
optional float emotion_contrast = 1;
// Coefficient for smoothing emotions over time
// 0 means no smoothing at all (can be jittery)
// 1 means extreme smoothing (emotion values not updated over time)
// Default value: 0.7
// Min: 0
// Max: 1
optional float live_blend_coef = 2;
// Activate blending between the preferred emotions (passed as input) and the emotions detected by A2E.
// Default: True
optional bool enable_preferred_emotion = 3;
// Sets the strength of the preferred emotions (passed as input) relative to emotions detected by A2E.
// 0 means only A2E output will be used for emotion rendering.
// 1 means only the preferred emotions will be used for emotion rendering.
// Default value: 0.5
// Min: 0
// Max: 1
optional float preferred_emotion_strength = 4;
// Sets the strength of generated emotions relative to neutral emotion.
// This multiplier is applied globally after the mix of emotion is done.
// If set to 0, emotion will be neutral.
// If set to 1, the blend of emotion will be fully used. (can be too intense)
// Default value: 0.6
// Min: 0
// Max: 1
optional float emotion_strength = 5;
// Sets a firm limit on the number of emotion sliders engaged by A2E;
// emotions with the highest weights are prioritized
// Default value: 3
// Min: 1
// Max: 6
optional int32 max_emotions = 6;
}
message AudioWithEmotion {
// Audio buffer in bytes; interpret it according to the audio header
bytes audio_buffer = 1;
// The time codes are relative to the beginning of the audio clip.
repeated nvidia_ace.emotion_with_timecode.v1.EmotionWithTimeCode emotions = 2;
}
//nvidia_ace.a2f.v1
//v0.1.0
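To illustrate how the parameter messages above can be filled before being placed in an AudioStreamHeader, the sketch below uses a placeholder module name and example parameter and blendshape keys; refer to the documentation for the supported names and value ranges.
import a2f_pb2  # placeholder: generated nvidia_ace.a2f.v1 module

# Map-typed fields are filled key by key; the keys below are examples only.
face_params = a2f_pb2.FaceParameters()
face_params.float_params["lowerFaceSmoothing"] = 0.006   # example parameter name

blendshape_params = a2f_pb2.BlendShapeParameters()
blendshape_params.bs_weight_multipliers["JawOpen"] = 1.2  # example blendshape name
blendshape_params.bs_weight_offsets["JawOpen"] = 0.0

# The values below match the defaults documented in the comments above.
emotion_pp = a2f_pb2.EmotionPostProcessingParameters(
    emotion_contrast=1.0,
    live_blend_coef=0.7,
    enable_preferred_emotion=True,
    preferred_emotion_strength=0.5,
    emotion_strength=0.6,
    max_emotions=3,
)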
nvidia_ace.emotion_with_timecode.v1.proto
syntax = "proto3";
package nvidia_ace.emotion_with_timecode.v1;
// Emotions with time codes allow clients to control when emotions are
// applied to an audio clip
// Example 1:
// time_code = 0.0
// emotion = { "joy" : 1.0 }
// At the start of the audio clip, the joy emotion will be applied
// at its maximum intensity.
// Example 2:
// time_code = 3.0
// emotion = { "outofbreath" : 0.5 }
// At the 3-second mark in the audio clip, the outofbreath emotion
// will be applied at half intensity.
message EmotionWithTimeCode {
// Time at which to apply the selected emotion
// This time is relative to the beginning of the audio clip
double time_code = 1;
// This maps the emotion names to the corresponding emotion strength
// Missing emotion values will be set to 0.0
// Refer to the documentation to see what emotions are available
map<string, float> emotion = 2;
}
//nvidia_ace.emotion_with_timecode.v1
//v0.1.0
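The two examples above translate into messages roughly as follows; the module names are placeholders and the audio buffer is a silent stand-in.
import a2f_pb2                                   # placeholder: generated nvidia_ace.a2f.v1
import emotion_with_timecode_pb2 as emotion_pb2  # placeholder: generated nvidia_ace.emotion_with_timecode.v1

pcm_chunk = b"\x00\x00" * 16000  # 1 second of silence: 16-bit mono PCM at 16 kHz

audio_msg = a2f_pb2.AudioWithEmotion(
    audio_buffer=pcm_chunk,
    emotions=[
        # Example 1: joy at full intensity from the start of the clip.
        emotion_pb2.EmotionWithTimeCode(time_code=0.0, emotion={"joy": 1.0}),
        # Example 2: outofbreath at half intensity from the 3-second mark.
        emotion_pb2.EmotionWithTimeCode(time_code=3.0, emotion={"outofbreath": 0.5}),
    ],
)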
nvidia_ace.audio.v1.proto
syntax = "proto3";
package nvidia_ace.audio.v1;
message AudioHeader {
enum AudioFormat { AUDIO_FORMAT_PCM = 0; }
// Example value: AUDIO_FORMAT_PCM
AudioFormat audio_format = 1;
// Currently only mono audio is supported.
// Example value: 1
uint32 channel_count = 2;
// Defines the sample rate of the provided audio data
// Example value: 16000
uint32 samples_per_second = 3;
// Currently only 16 bits per sample are supported.
// Example value: 16
uint32 bits_per_sample = 4;
}
//nvidia_ace.audio.v1
//v0.1.0
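As a small sketch, an AudioHeader matching these constraints can be derived from a local WAV file with the standard wave module; the file name and the generated module name are assumptions.
import wave

import audio_pb2  # placeholder: generated nvidia_ace.audio.v1 module

with wave.open("speech.wav", "rb") as wav:  # hypothetical 16-bit mono PCM file
    assert wav.getnchannels() == 1, "only mono audio is supported"
    assert wav.getsampwidth() == 2, "only 16 bits per sample are supported"
    header = audio_pb2.AudioHeader(
        audio_format=audio_pb2.AudioHeader.AUDIO_FORMAT_PCM,
        channel_count=wav.getnchannels(),
        samples_per_second=wav.getframerate(),
        bits_per_sample=8 * wav.getsampwidth(),
    )
    pcm_bytes = wav.readframes(wav.getnframes())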
nvidia_ace.animation_data.v1.proto
syntax = "proto3";
package nvidia_ace.animation_data.v1;
import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.status.v1.proto";
import "google/protobuf/any.proto";
// IMPORTANT NOTE: this is an AnimationDataStreamHeader WITH ID
// A similar AnimationDataStreamHeader exists in nvidia_ace.controller.v1.proto
// but that one does NOT contain IDs
message AnimationDataStreamHeader {
nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;
// This is required to identify from which animation source (e.g. A2F) the
// request originates. This allows us to map the incoming animation data
// stream to the correct pose provider animation graph node. The animation
// source MSs (e.g. A2F MS) should populate this with their name.
// Example Value: "A2F MS"
optional string source_service_id = 2;
// Metadata of the audio buffers. This defines the audio clip properties
// at the beginning of the streaming process.
optional nvidia_ace.audio.v1.AudioHeader audio_header = 3;
// Metadata containing the blendshape and joints names.
// This defines the names of the blendshapes and joints flowing through a stream.
optional nvidia_ace.animation_data.v1.SkelAnimationHeader
skel_animation_header = 4;
// Time codes indicate the relative progression of animation data, an audio
// clip, etc. The unit is seconds. In addition, we also need an absolute time
// reference shared across services. The start time is stored in time codes
// elapsed since the Unix time epoch. start_time_code_since_epoch = `Unix
// timestamp in seconds`. NTP should be good enough to synchronize clocks
// across nodes. From Wikipedia: NTP can usually maintain time to within tens
// of milliseconds over the public Internet, and can achieve better than one
// millisecond accuracy in local area networks under ideal conditions.
// Alternatively, there is PTP.
double start_time_code_since_epoch = 5;
// A generic metadata field to attach use case specific data (e.g. session id
// or user id?)
// map<string, string> metadata = 6;
// map<string, google.protobuf.Any> metadata = 6;
}
// This message represents each message of a stream of animation data.
message AnimationDataStream {
oneof stream_part {
// The header must be sent as the first message.
AnimationDataStreamHeader animation_data_stream_header = 1;
// Then one or more animation data messages must be sent.
nvidia_ace.animation_data.v1.AnimationData animation_data = 2;
// A status may be sent in between, and a final status must be sent last.
nvidia_ace.status.v1.Status status = 3;
}
}
message AnimationData {
optional SkelAnimation skel_animation = 1;
optional AudioWithTimeCode audio = 2;
optional Camera camera = 3;
// Metadata such as emotion aggregates, etc...
map<string, google.protobuf.Any> metadata = 4;
}
message AudioWithTimeCode {
// The time code is relative to the `start_time_code_since_epoch`.
// Example Value: 0.0 (for the very first audio buffer flowing out of a service)
double time_code = 1;
// Audio data in bytes; refer to the audio header
// to interpret these bytes.
bytes audio_buffer = 2;
}
message SkelAnimationHeader {
// Names of the blendshapes, sent only once in the header.
// The position of these names matches the position of the values
// in the blendshape messages.
// As an example, if the blendshape names are ["Eye Left", "Eye Right", "Jaw"]
// and you receive the values [0.1, 0.5, 0.2] with timecode = 0.0 during streaming,
// the pairing for timecode=0.0 is "Eye Left"=0.1, "Eye Right"=0.5, "Jaw"=0.2
repeated string blend_shapes = 1;
// Names of the joints, sent only once in the header
repeated string joints = 2;
}
message SkelAnimation {
// Time codes must be strictly monotonically increasing.
// Two successive SkelAnimation messages must not have overlapping time code
// ranges.
repeated FloatArrayWithTimeCode blend_shape_weights = 1;
repeated Float3ArrayWithTimeCode translations = 2;
repeated QuatFArrayWithTimeCode rotations = 3;
repeated Float3ArrayWithTimeCode scales = 4;
}
message Camera {
repeated Float3WithTimeCode position = 1;
repeated QuatFWithTimeCode rotation = 2;
repeated FloatWithTimeCode focal_length = 3;
repeated FloatWithTimeCode focus_distance = 4;
}
message FloatArrayWithTimeCode {
double time_code = 1;
repeated float values = 2;
}
message Float3ArrayWithTimeCode {
double time_code = 1;
repeated Float3 values = 2;
}
message QuatFArrayWithTimeCode {
double time_code = 1;
repeated QuatF values = 2;
}
message Float3WithTimeCode {
double time_code = 1;
Float3 value = 2;
}
message QuatFWithTimeCode {
double time_code = 1;
QuatF value = 2;
}
message FloatWithTimeCode {
double time_code = 1;
float value = 2;
}
message QuatF {
float real = 1;
float i = 2;
float j = 3;
float k = 4;
}
message Float3 {
float x = 1;
float y = 2;
float z = 3;
}
//nvidia_ace.animation_data.v1
//v0.1.0
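The name/value pairing described in the SkelAnimationHeader comments can be done, for example, like this (plain Python, operating on the received protobuf messages):
def frames_from_skel_animation(skel_animation_header, skel_animation):
    # Blendshape names arrive once in the header; values arrive per time code.
    names = list(skel_animation_header.blend_shapes)
    for frame in skel_animation.blend_shape_weights:
        # e.g. names = ["Eye Left", "Eye Right", "Jaw"], frame.values = [0.1, 0.5, 0.2]
        yield frame.time_code, dict(zip(names, frame.values))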
nvidia_ace.status.v1.proto
syntax = "proto3";
package nvidia_ace.status.v1;
// This status message indicates the result of an operation
// Refer to the rpc using it for more information
message Status {
enum Code {
SUCCESS = 0;
INFO = 1;
WARNING = 2;
ERROR = 3;
}
// Type of message returned by the service
// Example value: SUCCESS
Code code = 1;
// Message returned by the service
// Example value: "Audio processing completed successfully!"
string message = 2;
}
//nvidia_ace.status.v1
//v0.1.0
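For illustration, a small helper that maps the final Status of a stream to a log record; the generated module name is a placeholder.
import logging

import status_pb2  # placeholder: generated nvidia_ace.status.v1 module

_LOG_LEVELS = {
    status_pb2.Status.SUCCESS: logging.INFO,
    status_pb2.Status.INFO: logging.INFO,
    status_pb2.Status.WARNING: logging.WARNING,
    status_pb2.Status.ERROR: logging.ERROR,
}

def log_status(status):
    logging.log(_LOG_LEVELS[status.code], "A2F Controller: %s", status.message)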