Skip to content

Commit 1e227e6

Browse files
dmccrystal and s0h3yl committed
tests: add redact_pii tests
Co-authored-by: Soheyl <[email protected]>
GitOrigin-RevId: 7d51651f62ee45ff08597b9c8dd9f55cfc9d8e57
1 parent a82f0c6 commit 1e227e6

File tree

4 files changed

+199
-8
lines changed

4 files changed

+199
-8
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ for result in result:
240240
import assemblyai as aai
241241

242242
config = aai.TranscriptionConfig()
243-
config.set_pii_redact(
243+
config.set_redact_pii(
244244
# What should be redacted
245245
policies=[
246246
aai.PIIRedactionPolicy.credit_card_number,
@@ -257,6 +257,8 @@ transcriber = aai.Transcriber()
257257
transcript = transcriber.transcribe("https://example.org/audio.mp3", config)
258258
```
259259

260+
[Read more about PII redaction here.](https://www.assemblyai.com/docs/Models/pii_redaction)
261+
260262
</details>
261263
<details>
262264
<summary>Summarize the content of a transcript over time</summary>

assemblyai/types.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ def __init__(
409409
filter_profanity: Optional[bool] = None,
410410
redact_pii: Optional[bool] = None,
411411
redact_pii_audio: Optional[bool] = None,
412-
redact_pii_policies: Optional[PIIRedactionPolicy] = None,
412+
redact_pii_policies: Optional[List[PIIRedactionPolicy]] = None,
413413
redact_pii_sub: Optional[PIISubstitutionPolicy] = None,
414414
speaker_labels: Optional[bool] = None,
415415
speakers_expected: Optional[int] = None,
@@ -443,7 +443,7 @@ def __init__(
443443
boost_param: The weight to apply to words/phrases in the word_boost array.
444444
filter_profanity: Filter profanity from the transcribed text.
445445
redact_pii: Redact PII from the transcribed text.
446-
redact_pii_audio: Generate a copy of the original media file with spoken PII 'beeped' out.
446+
redact_pii_audio: Generate a copy of the original media file with spoken PII 'beeped' out (new audio only available for 24 hours).
447447
redact_pii_policies: The list of PII Redaction policies to enable.
448448
redact_pii_sub: The replacement logic for detected PII.
449449
speaker_labels: Enable Speaker Diarization.
@@ -956,17 +956,17 @@ def set_word_boost(
956956

957957
def set_redact_pii(
958958
self,
959-
enable: bool = True,
960-
redact_audio: bool = False,
961-
policies: List[PIIRedactionPolicy] = [],
962-
substitution: PIISubstitutionPolicy = PIISubstitutionPolicy.hash,
959+
enable: Optional[bool] = True,
960+
redact_audio: Optional[bool] = None,
961+
policies: Optional[List[PIIRedactionPolicy]] = None,
962+
substitution: Optional[PIISubstitutionPolicy] = None,
963963
) -> Self:
964964
"""
965965
Enables Personal Identifiable Information (PII) Redaction feature.
966966
967967
Args:
968968
enable: whether to enable or disable the PII Redaction feature.
969-
redact_audio: Generate a copy of the original media file with spoken PII 'beeped' out.
969+
redact_audio: Generate a copy of the original media file with spoken PII 'beeped' out. NOTE: The copy is only available for 24 hours.
970970
policies: A list of PII redaction policies to enable.
971971
substitution: The replacement logic for detected PII (`PIISubstitutionPolicy.hash` by default).
972972
"""
@@ -979,6 +979,9 @@ def set_redact_pii(
979979

980980
return self
981981

982+
if not policies:
983+
raise ValueError("You must provide at least one PII redaction policy.")
984+
982985
self._raw_transcription_config.redact_pii = True
983986
self._raw_transcription_config.redact_pii_audio = redact_audio
984987
self._raw_transcription_config.redact_pii_policies = policies

tests/unit/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ py_test(
3131
"test_entity_detection.py",
3232
"test_iab_categories.py",
3333
"test_lemur.py",
34+
"test_redact_pii.py",
3435
"test_sentiment_analysis.py",
3536
"test_summarization.py",
3637
"test_transcriber.py",

tests/unit/test_redact_pii.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import json
2+
from typing import Any, Dict, Tuple
3+
4+
import factory
5+
import httpx
6+
import pytest
7+
from pytest_httpx import HTTPXMock
8+
9+
import assemblyai as aai
10+
from tests.unit import factories
11+
12+
aai.settings.api_key = "test"
13+
14+
15+
def __submit_mock_request(
16+
httpx_mock: HTTPXMock,
17+
mock_response: Dict[str, Any],
18+
config: aai.TranscriptionConfig,
19+
) -> Tuple[Dict[str, Any], aai.Transcript]:
20+
"""
21+
Helper function to abstract mock transcriber calls with given `TranscriptionConfig`,
22+
and perform some common assertions.
23+
"""
24+
25+
mock_transcript_id = mock_response.get("id", "mock_id")
26+
27+
# Mock initial submission response (transcript is processing)
28+
mock_processing_response = factories.generate_dict_factory(
29+
factories.TranscriptProcessingResponseFactory
30+
)()
31+
32+
httpx_mock.add_response(
33+
url=f"{aai.settings.base_url}/transcript",
34+
status_code=httpx.codes.OK,
35+
method="POST",
36+
json={
37+
**mock_processing_response,
38+
"id": mock_transcript_id, # inject ID from main mock response
39+
},
40+
)
41+
42+
# Mock polling-for-completeness response, with completed transcript
43+
httpx_mock.add_response(
44+
url=f"{aai.settings.base_url}/transcript/{mock_transcript_id}",
45+
status_code=httpx.codes.OK,
46+
method="GET",
47+
json=mock_response,
48+
)
49+
50+
# == Make API request via SDK ==
51+
transcript = aai.Transcriber().transcribe(
52+
data="https://example.org/audio.wav",
53+
config=config,
54+
)
55+
56+
# Check that submission and polling requests were made
57+
assert len(httpx_mock.get_requests()) == 2
58+
59+
# Extract body of initial submission request
60+
request = httpx_mock.get_requests()[0]
61+
request_body = json.loads(request.content.decode())
62+
63+
return request_body, transcript
64+
65+
66+
def test_redact_pii_disabled_by_default(httpx_mock: HTTPXMock):
67+
"""
68+
Tests that excluding `redact_pii` from the `TranscriptionConfig` will
69+
result in the default behavior of it being excluded from the request body
70+
"""
71+
request_body, transcript = __submit_mock_request(
72+
httpx_mock,
73+
mock_response=factories.generate_dict_factory(
74+
factories.TranscriptCompletedResponseFactory
75+
)(),
76+
config=aai.TranscriptionConfig(),
77+
)
78+
assert request_body.get("redact_pii") is None
79+
assert request_body.get("redact_pii_audio") is None
80+
assert request_body.get("redact_pii_policies") is None
81+
assert request_body.get("redact_pii_sub") is None
82+
83+
84+
def test_redact_pii_enabled(httpx_mock: HTTPXMock):
85+
"""
86+
Tests that enabling `redact_pii`, along with the required `redact_pii_policies`
87+
parameter will result in the request body containing those fields
88+
"""
89+
policies = [
90+
aai.types.PIIRedactionPolicy.date,
91+
aai.types.PIIRedactionPolicy.phone_number,
92+
]
93+
94+
request_body, _ = __submit_mock_request(
95+
httpx_mock,
96+
mock_response=factories.generate_dict_factory(
97+
factories.TranscriptCompletedResponseFactory
98+
)(),
99+
config=aai.TranscriptionConfig(
100+
redact_pii=True,
101+
redact_pii_policies=policies,
102+
),
103+
)
104+
105+
assert request_body.get("redact_pii") is True
106+
assert request_body.get("redact_pii_policies") == policies
107+
108+
109+
def test_redact_pii_enabled_with_optional_params(httpx_mock: HTTPXMock):
110+
"""
111+
Tests that enabling `redact_pii`, along with the other optional parameters
112+
relevant to PII redaction, will result in the request body containing
113+
those fields
114+
"""
115+
policies = [
116+
aai.types.PIIRedactionPolicy.date,
117+
aai.types.PIIRedactionPolicy.phone_number,
118+
]
119+
sub_type = aai.types.PIISubstitutionPolicy.entity_name
120+
121+
request_body, _ = __submit_mock_request(
122+
httpx_mock,
123+
mock_response=factories.generate_dict_factory(
124+
factories.TranscriptCompletedResponseFactory
125+
)(),
126+
config=aai.TranscriptionConfig(
127+
redact_pii=True,
128+
redact_pii_audio=True,
129+
redact_pii_policies=policies,
130+
redact_pii_sub=sub_type,
131+
),
132+
)
133+
134+
assert request_body.get("redact_pii") is True
135+
assert request_body.get("redact_pii_audio") is True
136+
assert request_body.get("redact_pii_policies") == policies
137+
assert request_body.get("redact_pii_sub") == sub_type
138+
139+
140+
def test_redact_pii_fails_without_policies(httpx_mock: HTTPXMock):
141+
"""
142+
Tests that enabling `redact_pii` without specifying any policies
143+
will result in an exception being raised before the API call is made
144+
"""
145+
with pytest.raises(ValueError) as error:
146+
__submit_mock_request(
147+
httpx_mock,
148+
mock_response={},
149+
config=aai.TranscriptionConfig(
150+
redact_pii=True,
151+
# No policies!
152+
),
153+
)
154+
155+
assert "policy" in str(error)
156+
157+
# Check that the error was raised before any requests were made
158+
assert len(httpx_mock.get_requests()) == 0
159+
160+
# Inform httpx_mock that it's okay we didn't make any requests
161+
httpx_mock.reset(False)
162+
163+
164+
def test_redact_pii_params_excluded_when_disabled(httpx_mock: HTTPXMock):
165+
"""
166+
Tests that additional PII redaction parameters are excluded from the submission
167+
request body if `redact_pii` itself is not enabled.
168+
"""
169+
request_body, _ = __submit_mock_request(
170+
httpx_mock,
171+
mock_response=factories.generate_dict_factory(
172+
factories.TranscriptCompletedResponseFactory
173+
)(),
174+
config=aai.TranscriptionConfig(
175+
redact_pii=False,
176+
redact_pii_audio=True,
177+
redact_pii_policies=[aai.types.PIIRedactionPolicy.date],
178+
redact_pii_sub=aai.types.PIISubstitutionPolicy.entity_name,
179+
),
180+
)
181+
182+
assert request_body.get("redact_pii") is None
183+
assert request_body.get("redact_pii_audio") is None
184+
assert request_body.get("redact_pii_policies") is None
185+
assert request_body.get("redact_pii_sub") is None

0 commit comments

Comments
 (0)