Skip to content

Fix excessive token usage with Unicode text in realtime event serialization #2444

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/openai/resources/beta/realtime/realtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ async def send(self, event: RealtimeClientEvent | RealtimeClientEventParam) -> N
data = (
event.to_json(use_api_names=True, exclude_defaults=True, exclude_unset=True)
if isinstance(event, BaseModel)
- else json.dumps(await async_maybe_transform(event, RealtimeClientEventParam))
+ else json.dumps(await async_maybe_transform(event, RealtimeClientEventParam), ensure_ascii=False)
)
await self._connection.send(data)

Expand Down Expand Up @@ -467,7 +467,7 @@ def send(self, event: RealtimeClientEvent | RealtimeClientEventParam) -> None:
data = (
event.to_json(use_api_names=True, exclude_defaults=True, exclude_unset=True)
if isinstance(event, BaseModel)
- else json.dumps(maybe_transform(event, RealtimeClientEventParam))
+ else json.dumps(maybe_transform(event, RealtimeClientEventParam), ensure_ascii=False)
)
self._connection.send(data)

Expand Down
173 changes: 173 additions & 0 deletions tests/api_resources/beta/realtime/test_unicode_serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""Test that realtime event serialization properly handles Unicode characters."""

import json
from typing import Any, Dict

import pytest

from openai.types.beta.realtime.realtime_client_event_param import RealtimeClientEventParam


class TestUnicodeSerializationFix:
    """Check how ``json.dumps`` serializes realtime-style event payloads containing non-ASCII text.

    Every test builds a plain dict shaped like a realtime client event and
    serializes it with ``json.dumps`` directly, comparing the default
    ``ensure_ascii=True`` output against ``ensure_ascii=False``.
    """

    def test_cyrillic_text_serialization(self) -> None:
        """Cyrillic text is emitted verbatim, and more compactly, with ``ensure_ascii=False``."""
        # Sample event with Cyrillic text (simulating function calling with Russian descriptions)
        event_data: RealtimeClientEventParam = {
            "type": "response.create",
            "response": {
                "modalities": ["text"],
                "instructions": "Ответьте на русском языке",
                "tools": [
                    {
                        "type": "function",
                        "name": "get_user_info",
                        "description": "Получить информацию о пользователе",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "name": {
                                    "type": "string",
                                    "description": "Имя пользователя",
                                },
                                "age": {
                                    "type": "integer",
                                    "description": "Возраст пользователя",
                                },
                            },
                            "required": ["name"],
                        },
                    }
                ],
            },
        }

        # Serialize the same payload both ways so the two outputs can be compared.
        serialized_with_escapes = json.dumps(event_data)  # Default ensure_ascii=True
        serialized_without_escapes = json.dumps(event_data, ensure_ascii=False)

        # Each escaped character becomes a six-character "\uXXXX" sequence, so the
        # unescaped form must be strictly shorter.
        assert len(serialized_without_escapes) < len(serialized_with_escapes), (
            "Serialization with ensure_ascii=False should be more compact"
        )

        # Verify no Unicode escapes in the fixed version
        assert "\\u" not in serialized_without_escapes, (
            "Fixed serialization should not contain Unicode escape sequences"
        )

        # Verify original Cyrillic text is preserved
        assert "Имя пользователя" in serialized_without_escapes, (
            "Original Cyrillic text should be preserved in fixed serialization"
        )

        # Verify both versions parse to the same data
        assert json.loads(serialized_with_escapes) == json.loads(serialized_without_escapes), (
            "Both serialization methods should produce equivalent JSON when parsed"
        )

    def test_unicode_token_savings(self) -> None:
        """Unescaped output is substantially smaller for Unicode-heavy payloads."""
        # Event with substantial Unicode content
        event_with_unicode = {
            "type": "session.update",
            "session": {
                "instructions": (
                    "Вы помощник, который говорит на русском языке. "
                    "Отвечайте вежливо и информативно на все вопросы пользователей. "
                    "Используйте правильную грамматику и орфографию."
                ),
                "voice": "alloy",
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 200,
                },
            },
        }

        # Compare serialization sizes
        with_escapes = json.dumps(event_with_unicode)
        without_escapes = json.dumps(event_with_unicode, ensure_ascii=False)

        # Calculate savings (in characters)
        size_reduction = len(with_escapes) - len(without_escapes)
        percentage_saved = (size_reduction / len(with_escapes)) * 100

        # Should provide substantial savings for Unicode-heavy content
        assert percentage_saved > 30, (
            f"Expected >30% size reduction, got {percentage_saved:.1f}%"
        )

        # Estimated token savings (rough estimate: 4 chars per token)
        estimated_token_savings = size_reduction / 4
        assert estimated_token_savings > 20, (
            f"Expected >20 tokens saved, estimated {estimated_token_savings:.0f}"
        )

        # A character count treats every non-ASCII character as size 1, but such
        # characters take more than one byte in UTF-8. Confirm the unescaped form
        # is still smaller once both strings are encoded.
        assert len(without_escapes.encode("utf-8")) < len(with_escapes.encode("utf-8")), (
            "Unescaped serialization should also be smaller when encoded as UTF-8"
        )

    def test_mixed_content_serialization(self) -> None:
        """ASCII and Cyrillic text in the same string both survive serialization."""
        mixed_content_event = {
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Hello! Can you help me? Привет! Помогите мне, пожалуйста!",
                    }
                ],
            },
        }

        serialized = json.dumps(mixed_content_event, ensure_ascii=False)

        # Verify both languages are preserved correctly
        assert "Hello! Can you help me?" in serialized
        assert "Привет! Помогите мне, пожалуйста!" in serialized
        assert "\\u" not in serialized  # No Unicode escapes

        # The unescaped output must still be valid JSON that decodes to the input.
        assert json.loads(serialized) == mixed_content_event

    def test_ascii_only_content_unchanged(self) -> None:
        """ASCII-only payloads serialize identically under both ``ensure_ascii`` settings."""
        ascii_only_event = {
            "type": "response.create",
            "response": {
                "modalities": ["text", "audio"],
                "instructions": "Please respond in English only.",
                "voice": "alloy",
            },
        }

        with_escapes = json.dumps(ascii_only_event)
        without_escapes = json.dumps(ascii_only_event, ensure_ascii=False)

        # For ASCII-only content, both should be identical
        assert with_escapes == without_escapes, (
            "ASCII-only content should be identical with both ensure_ascii settings"
        )

    @pytest.mark.parametrize("language_text,expected_text", [
        ("中文测试", "中文测试"),  # Chinese
        ("العربية", "العربية"),  # Arabic
        ("עברית", "עברית"),  # Hebrew
        ("日本語", "日本語"),  # Japanese
        ("한국어", "한국어"),  # Korean
        ("हिन्दी", "हिन्दी"),  # Hindi
    ])
    def test_various_unicode_scripts(self, language_text: str, expected_text: str) -> None:
        """Text from several scripts is preserved verbatim with ``ensure_ascii=False``.

        Args:
            language_text: Sample text embedded in the event's instructions.
            expected_text: Substring that must appear unescaped in the output.
        """
        event = {
            "type": "session.update",
            "session": {
                "instructions": f"Respond in this language: {language_text}",
            },
        }

        serialized = json.dumps(event, ensure_ascii=False)

        # Verify the original text is preserved
        assert expected_text in serialized
        assert "\\u" not in serialized  # No Unicode escapes

        # The unescaped output must still be valid JSON that decodes to the input.
        assert json.loads(serialized) == event