1
1
from __future__ import annotations
2
2
3
- import base64
4
3
import concurrent .futures
5
4
import functools
6
5
import json
@@ -987,6 +986,7 @@ def __init__(
987
986
encoding : Optional [types .AudioEncoding ] = None ,
988
987
token : Optional [str ] = None ,
989
988
client : _client .Client ,
989
+ end_utterance_silence_threshold : Optional [int ],
990
990
) -> None :
991
991
self ._client = client
992
992
self ._websocket : Optional [websockets .sync .client .ClientConnection ] = None
@@ -999,8 +999,9 @@ def __init__(
999
999
self ._word_boost = word_boost
1000
1000
self ._encoding = encoding
1001
1001
self ._token = token
1002
+ self ._end_utterance_silence_threshold = end_utterance_silence_threshold
1002
1003
1003
- self ._write_queue : queue .Queue [bytes ] = queue .Queue ()
1004
+ self ._write_queue : queue .Queue [Union [ bytes , Dict ] ] = queue .Queue ()
1004
1005
self ._write_thread = threading .Thread (target = self ._write )
1005
1006
self ._read_thread = threading .Thread (target = self ._read )
1006
1007
self ._stop_event = threading .Event ()
@@ -1048,13 +1049,40 @@ def connect(
1048
1049
self ._read_thread .start ()
1049
1050
self ._write_thread .start ()
1050
1051
1052
+ if self ._end_utterance_silence_threshold is not None :
1053
+ self .configure_end_utterance_silence_threshold (
1054
+ self ._end_utterance_silence_threshold
1055
+ )
1056
+
1051
1057
def stream(self, data: bytes) -> None:
    """
    Enqueues a chunk of audio data on the write queue for delivery to the
    real-time service.
    """

    # Nothing is sent from here; the chunk is only handed to the queue.
    pending = self._write_queue
    pending.put(data)
1057
1063
1064
def configure_end_utterance_silence_threshold(
    self, threshold_milliseconds: int
) -> None:
    """
    Queues a message that sets the end of utterance silence threshold.
    Can be called multiple times during a session at any point after the session starts.

    Args:
        `threshold_milliseconds`: The threshold in milliseconds.
    """

    # Build the message first, then enqueue its dictionary form.
    message = _RealtimeEndUtteranceSilenceThreshold(threshold_milliseconds)
    self._write_queue.put(message.as_dict())
1078
+
1079
def force_end_utterance(self) -> None:
    """
    Queues a message that forces the end of the current utterance.
    """

    # The message is enqueued in its dictionary form.
    payload = _RealtimeForceEndUtterance().as_dict()
    self._write_queue.put(payload)
1085
+
1058
1086
def close (self , terminate : bool = False ) -> None :
1059
1087
"""
1060
1088
Closes the connection to the real-time service gracefully.
@@ -1116,25 +1144,12 @@ def _write(self) -> None:
1116
1144
if isinstance (data , dict ):
1117
1145
self ._websocket .send (json .dumps (data ))
1118
1146
elif isinstance (data , bytes ):
1119
- self ._websocket .send (self . _encode_data ( data ) )
1147
+ self ._websocket .send (data )
1120
1148
else :
1121
1149
raise ValueError ("unsupported message type" )
1122
1150
except websockets .exceptions .ConnectionClosed as exc :
1123
1151
return self ._handle_error (exc )
1124
1152
1125
- def _encode_data (self , data : bytes ) -> str :
1126
- """
1127
- Encodes the given audio chunk as a base64 string.
1128
-
1129
- This is a helper method for `_write`.
1130
- """
1131
-
1132
- return json .dumps (
1133
- {
1134
- "audio_data" : base64 .b64encode (data ).decode ("utf-8" ),
1135
- }
1136
- )
1137
-
1138
1153
def _handle_message (
1139
1154
self ,
1140
1155
message : Dict [str , Any ],
@@ -1208,6 +1223,25 @@ def create_temporary_token(
1208
1223
)
1209
1224
1210
1225
1226
class _RealtimeForceEndUtterance:
    """
    Message that requests the end of the current utterance.
    """

    def as_dict(self) -> Dict[str, bool]:
        """
        Returns the message as a dictionary with a single boolean flag.
        """

        payload = {"force_end_utterance": True}
        return payload
1231
+
1232
+
1233
class _RealtimeEndUtteranceSilenceThreshold:
    """
    Message that carries an end of utterance silence threshold.
    """

    def __init__(self, threshold_milliseconds: int) -> None:
        """
        Args:
            `threshold_milliseconds`: The threshold in milliseconds.
        """

        # Kept private; read access goes through the `value` property.
        self._value = threshold_milliseconds

    @property
    def value(self) -> int:
        """
        The threshold exactly as it was passed to the constructor.
        """

        return self._value

    def as_dict(self) -> Dict[str, int]:
        """
        Returns the message as a dictionary with the threshold under a single key.
        """

        payload = {"end_utterance_silence_threshold": self._value}
        return payload
1243
+
1244
+
1211
1245
class RealtimeTranscriber :
1212
1246
def __init__ (
1213
1247
self ,
@@ -1221,6 +1255,7 @@ def __init__(
1221
1255
encoding : Optional [types .AudioEncoding ] = None ,
1222
1256
token : Optional [str ] = None ,
1223
1257
client : Optional [_client .Client ] = None ,
1258
+ end_utterance_silence_threshold : Optional [int ] = None ,
1224
1259
) -> None :
1225
1260
"""
1226
1261
Creates a new real-time transcriber.
@@ -1235,6 +1270,7 @@ def __init__(
1235
1270
`encoding`: (Optional) The encoding of the audio data.
1236
1271
`token`: (Optional) A temporary authentication token.
1237
1272
`client`: (Optional) The client to use for the real-time service.
1273
+ `end_utterance_silence_threshold`: (Optional) The end utterance silence threshold in milliseconds.
1238
1274
"""
1239
1275
1240
1276
self ._client = client or _client .Client .get_default (
@@ -1251,6 +1287,7 @@ def __init__(
1251
1287
encoding = encoding ,
1252
1288
token = token ,
1253
1289
client = self ._client ,
1290
+ end_utterance_silence_threshold = end_utterance_silence_threshold ,
1254
1291
)
1255
1292
1256
1293
def connect (
@@ -1268,8 +1305,7 @@ def connect(
1268
1305
self ._impl .connect (timeout = timeout )
1269
1306
1270
1307
def stream (
1271
- self ,
1272
- data : Union [bytes , Generator [bytes , None , None ], Iterable [bytes ]],
1308
+ self , data : Union [bytes , Generator [bytes , None , None ], Iterable [bytes ]]
1273
1309
) -> None :
1274
1310
"""
1275
1311
Streams raw audio data to the real-time service.
@@ -1286,6 +1322,26 @@ def stream(
1286
1322
for chunk in data :
1287
1323
self ._impl .stream (chunk )
1288
1324
1325
def configure_end_utterance_silence_threshold(
    self, threshold_milliseconds: int
) -> None:
    """
    Sets how long a silence must last before it is treated as the end of an utterance.
    This tunes where the transcription is split into final transcripts.
    Can be called multiple times during a session at any point after the session starts.

    Args:
        `threshold_milliseconds`: The threshold in milliseconds.
    """

    # Pure delegation: the underlying implementation object does the work.
    impl = self._impl
    impl.configure_end_utterance_silence_threshold(threshold_milliseconds)
1337
+
1338
def force_end_utterance(self) -> None:
    """
    Forces the end of the current utterance.
    After calling this method, the server will end the current utterance and return a final transcript.
    """

    # Pure delegation: the underlying implementation object does the work.
    impl = self._impl
    impl.force_end_utterance()
1344
+
1289
1345
def close (self ) -> None :
1290
1346
"""
1291
1347
Closes the connection to the real-time service.
0 commit comments