feat: add custom prompt config in the request and return prompt in th… · googleapis/googleapis@1a14153

@@ -779,19 +779,20 @@ message ExplicitDecodingConfig {

779779780780

// Configuration to enable speaker diarization.

781781

message SpeakerDiarizationConfig {

782-

// Required. Minimum number of speakers in the conversation. This range gives

783-

// you more flexibility by allowing the system to automatically determine the

784-

// correct number of speakers.

785-

//

786-

// To fix the number of speakers detected in the audio, set

787-

// `min_speaker_count` = `max_speaker_count`.

788-

int32 min_speaker_count = 2 [(google.api.field_behavior) = REQUIRED];

789-790-

// Required. Maximum number of speakers in the conversation. Valid values are:

791-

// 1-6. Must be >= `min_speaker_count`. This range gives you more flexibility

792-

// by allowing the system to automatically determine the correct number of

793-

// speakers.

794-

int32 max_speaker_count = 3 [(google.api.field_behavior) = REQUIRED];

782+

// Optional. The system automatically determines the number of speakers. This

783+

// value is not currently used.

784+

int32 min_speaker_count = 2 [(google.api.field_behavior) = OPTIONAL];

785+786+

// Optional. The system automatically determines the number of speakers. This

787+

// value is not currently used.

788+

int32 max_speaker_count = 3 [(google.api.field_behavior) = OPTIONAL];

789+

}

790+791+

// Configuration to enable a custom prompt in chirp3.

792+

message CustomPromptConfig {

793+

// Optional. The custom instructions to override the existing instructions for

794+

// chirp3.

795+

string custom_prompt = 1 [(google.api.field_behavior) = OPTIONAL];

795796

}

796797797798

// Available recognition features.

@@ -846,21 +847,19 @@ message RecognitionFeatures {

846847

// Mode for recognizing multi-channel audio.

847848

MultiChannelMode multi_channel_mode = 17;

848849849-

// Configuration to enable speaker diarization and set additional

850-

// parameters to make diarization better suited for your application.

851-

// When this is enabled, we send all the words from the beginning of the

852-

// audio for the top alternative in every consecutive STREAMING response.

853-

// This is done in order to improve our speaker tags as our models learn to

854-

// identify the speakers in the conversation over time.

855-

// For non-streaming requests, the diarization results will be provided only

856-

// in the top alternative of the FINAL SpeechRecognitionResult.

850+

// Configuration to enable speaker diarization. To enable diarization, set

851+

// this field to an empty SpeakerDiarizationConfig message.

857852

SpeakerDiarizationConfig diarization_config = 9;

858853859854

// Maximum number of recognition hypotheses to be returned.

860855

// The server may return fewer than `max_alternatives`.

861856

// Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of

862857

// one. If omitted, will return a maximum of one.

863858

int32 max_alternatives = 16;

859+860+

// Optional. Configuration to enable a custom prompt for chirp3.

861+

CustomPromptConfig custom_prompt_config = 18

862+

[(google.api.field_behavior) = OPTIONAL];

864863

}

865864866865

// Transcription normalization configuration. Use transcription normalization

@@ -1066,6 +1065,13 @@ message RecognitionResponseMetadata {

1066106510671066

// When available, billed audio seconds for the corresponding request.

10681067

google.protobuf.Duration total_billed_duration = 6;

1068+1069+

// Optional. Output only. Provides the prompt used for the recognition

1070+

// request.

1071+

optional string prompt = 10 [

1072+

(google.api.field_behavior) = OUTPUT_ONLY,

1073+

(google.api.field_behavior) = OPTIONAL

1074+

];

10691075

}

1070107610711077

// Alternative hypotheses (a.k.a. n-best list).