gRPC directly with Audio2Face

To interact with Audio2Face directly, you need to create a gRPC client that sends audio data to the service and implement a gRPC server that receives the resulting animation data.

Client side

This is the gRPC service definition that your client sends audio data to:

Service

nvidia_ace.services.a2f.v1.proto
syntax = "proto3";

package nvidia_ace.services.a2f.v1;

import "nvidia_ace.a2f.v1.proto";
import "nvidia_ace.status.v1.proto";

service A2FService {
  // RPC to implement for sending audio data to the Audio2Face Microservice.
  // An example use of this RPC is a client pushing audio buffers to the
  // Audio2Face Microservice (server).
  rpc PushAudioStream(stream nvidia_ace.a2f.v1.AudioStream)
      returns (nvidia_ace.status.v1.Status) {}
}
//nvidia_ace.services.a2f.v1
//v0.1.0
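PushAudioStream is a client-streaming RPC: the client sends a stream of AudioStream messages and receives a single Status back once the stream is closed. Below is a minimal Python sketch of the call, assuming the stubs were generated with grpcio-tools; the module names and the port are placeholders to adjust to your setup.

import grpc

# Placeholder module names; adjust to your generated stubs
# (from nvidia_ace.services.a2f.v1.proto and nvidia_ace.a2f.v1.proto).
import a2f_service_pb2_grpc
import a2f_pb2


def push_audio(messages, target="localhost:52000"):
    """Send an iterator of AudioStream messages; returns the final Status."""
    with grpc.insecure_channel(target) as channel:  # placeholder address
        stub = a2f_service_pb2_grpc.A2FServiceStub(channel)
        # gRPC consumes the iterator lazily; the end of audio is signaled
        # by the iterator (and hence the call) ending.
        return stub.PushAudioStream(messages)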

Protobuf data

nvidia_ace.a2f.v1.proto
syntax = "proto3";

package nvidia_ace.a2f.v1;


import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.emotion_with_timecode.v1.proto";


message AudioStream {
  // The header must be sent as the first message.
  // One or more audio-with-emotion messages must be sent thereafter.
  // The end of audio is signaled by the client closing the connection.
  oneof stream_part {
    AudioStreamHeader audio_stream_header = 1;
    nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
  }
}

// IMPORTANT NOTE: this is an AudioStreamHeader WITH IDs.
// A similar AudioStreamHeader exists in nvidia_ace.controller.v1.proto,
// but that one does NOT contain IDs.
message AudioStreamHeader {
  // IDs of the current stream
  nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;
  
  nvidia_ace.audio.v1.AudioHeader audio_header = 2;

  // Parameters for updating the facial characteristics of an avatar
  // See the documentation for more information
  FaceParameters face_params = 3;

  // Parameters relative to the emotion blending and processing
  // before using it to generate blendshapes
  // See the documentation for more information
  EmotionPostProcessingParameters emotion_post_processing_params = 4;

  // Multipliers and offsets to apply to the generated blendshape values
  BlendShapeParameters blendshape_params = 5;
}

message FloatArray { repeated float values = 1; }

// For which parameters to set here, refer to the documentation
message FaceParameters {
  map<string, float> float_params = 1;
  map<string, int32> integer_params = 2;
  map<string, FloatArray> float_array_params = 3;
}

// For the Blendshape names, refer to the documentation
message BlendShapeParameters {
  map<string, float> bs_weight_multipliers = 1;
  map<string, float> bs_weight_offsets = 2;
}

// For more information refer to the documentation
message EmotionPostProcessingParameters {
  // Increases the spread between emotion values by pushing them higher or lower.
  // Default value: 1
  // Min: 0.3
  // Max: 3
  optional float emotion_contrast = 1;

  // Coefficient for smoothing emotions over time
  //  0 means no smoothing at all (can be jittery)
  //  1 means extreme smoothing (emotion values not updated over time)
  // Default value: 0.7
  // Min: 0
  // Max: 1
  optional float live_blend_coef = 2;

  // Activates blending between the preferred emotions (passed as input) and the emotions detected by A2E.
  // Default: True
  optional bool enable_preferred_emotion = 3;

  // Sets the strength of the preferred emotions (passed as input) relative to emotions detected by A2E.
  // 0 means only A2E output will be used for emotion rendering.
  // 1 means only the preferred emotions will be used for emotion rendering.
  // Default value: 0.5
  // Min: 0
  // Max: 1
  optional float preferred_emotion_strength = 4;

  // Sets the strength of generated emotions relative to neutral emotion.
  // This multiplier is applied globally after the mix of emotion is done.
  // If set to 0, the emotion will be neutral.
  // If set to 1, the emotion blend is fully used (can be too intense).
  // Default value: 0.6
  // Min: 0
  // Max: 1
  optional float emotion_strength = 5;

  // Sets a hard limit on the number of emotions engaged by A2E;
  // the emotions with the highest weights are prioritized.
  // Default value: 3
  // Min: 1
  // Max: 6
  optional int32 max_emotions = 6;
}

message AudioWithEmotion {
  // Audio buffer in bytes, to be interpreted according to the audio header.
  bytes audio_buffer = 1;

  // The time codes are relative to the beginning of the audio clip.
  repeated nvidia_ace.emotion_with_timecode.v1.EmotionWithTimeCode emotions = 2;
}
//nvidia_ace.a2f.v1
//v0.1.0
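To make the message ordering concrete, here is a sketch of a Python generator that yields the header first and then the audio chunks, reusing the a2f_pb2 placeholder module from the client sketch above. It is a sketch only: animation_ids and audio_header are prebuilt messages from the imported nvidia_ace.animation_id.v1 and nvidia_ace.audio.v1 protos (not reproduced in this section), and the parameter values simply restate the documented defaults.

def audio_stream_messages(animation_ids, audio_header, pcm_bytes,
                          chunk_size=8192):
    # First message: the header with IDs, audio format, and parameters.
    header = a2f_pb2.AudioStreamHeader(
        animation_ids=animation_ids,
        audio_header=audio_header,
        emotion_post_processing_params=a2f_pb2.EmotionPostProcessingParameters(
            emotion_contrast=1.0,            # documented default
            live_blend_coef=0.7,             # documented default
            enable_preferred_emotion=True,   # documented default
            preferred_emotion_strength=0.5,  # documented default
            emotion_strength=0.6,            # documented default
            max_emotions=3,                  # documented default
        ),
    )
    yield a2f_pb2.AudioStream(audio_stream_header=header)
    # Subsequent messages: raw audio chunks; emotions with time codes
    # (nvidia_ace.emotion_with_timecode.v1) could be attached here as well.
    for offset in range(0, len(pcm_bytes), chunk_size):
        yield a2f_pb2.AudioStream(
            audio_with_emotion=a2f_pb2.AudioWithEmotion(
                audio_buffer=pcm_bytes[offset:offset + chunk_size]))

You would then pass this generator to the PushAudioStream call shown earlier, e.g. push_audio(audio_stream_messages(ids, header, data)).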

Server side

Service

Implementing PushAnimationDataStream as a server RPC makes it possible for you to receive animation data from A2F.

nvidia_ace.services.animation_data.v1.proto
syntax = "proto3";

package nvidia_ace.services.animation_data.v1;

import "nvidia_ace.animation_data.v1.proto";
import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";

// Two RPCs exist to provide a stream of animation data.
// Which RPC to implement depends on whether the service
// is a client or a server.
// E.g.: the Animation Graph Microservice implements both RPCs,
// one to receive and one to send.
service AnimationDataService {
  // When the service creating the animation data is a client of the
  // service receiving it, this push RPC must be used.
  // An example of this is Audio2Face Microservice creating animation data
  // and sending it to Animation Graph Microservice.
  rpc PushAnimationDataStream(stream nvidia_ace.animation_data.v1.AnimationDataStream)
      returns (nvidia_ace.status.v1.Status) {}
  // When the service creating the animation data is a server to the
  // service receiving it, this pull RPC must be used.
  // An example of this is Omniverse Renderer Microservice requesting
  // animation data from Animation Graph Microservice.
  rpc PullAnimationDataStream(nvidia_ace.animation_id.v1.AnimationIds)
      returns (stream nvidia_ace.animation_data.v1.AnimationDataStream) {}
}
//nvidia_ace.services.animation_data.v1
//v0.1.0
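On the receiving side, you implement PushAnimationDataStream in a servicer and return a Status once the incoming stream ends. Below is a minimal Python sketch, again assuming grpcio-tools-generated stubs; the module names and the port are placeholders, and the Status fields are defined in nvidia_ace.status.v1.proto (not reproduced here).

from concurrent import futures

import grpc

# Placeholder module names; adjust to your generated stubs (from
# nvidia_ace.services.animation_data.v1.proto and nvidia_ace.status.v1.proto).
import animation_data_service_pb2_grpc
import status_pb2


class AnimationDataServicer(
        animation_data_service_pb2_grpc.AnimationDataServiceServicer):
    def PushAnimationDataStream(self, request_iterator, context):
        for message in request_iterator:
            # Each streamed message is an AnimationDataStream part;
            # process it here (e.g., buffer it or forward it to a renderer).
            _ = message
        # Populate the fields per nvidia_ace.status.v1.proto as needed.
        return status_pb2.Status()


server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
animation_data_service_pb2_grpc.add_AnimationDataServiceServicer_to_server(
    AnimationDataServicer(), server)
server.add_insecure_port("[::]:52010")  # placeholder port
server.start()
server.wait_for_termination()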

Protobuf data

The messages streamed by these RPCs are defined in nvidia_ace.animation_data.v1.proto, together with the stream IDs from nvidia_ace.animation_id.v1.proto and the status message from nvidia_ace.status.v1.proto, all imported by the service definition above. Refer to those files for the AnimationDataStream message and its contents.