From 95cad5c435487fbb275b75ab99851c867867f97d Mon Sep 17 00:00:00 2001 From: AssemblyAI Date: Tue, 6 Jun 2023 12:05:35 -0400 Subject: [PATCH 1/3] Project import generated by Copybara. GitOrigin-RevId: ea71d0332ba79dc276e93de36daf33ebb6066493 --- README.md | 86 +++++++++++-------- assemblyai/transcriber.py | 4 + assemblyai/types.py | 36 ++++---- tests/unit/factories.py | 11 +++ tests/unit/test_auto_chapters.py | 140 +++++++++++++++++++++++++++++++ 5 files changed, 229 insertions(+), 48 deletions(-) create mode 100644 tests/unit/test_auto_chapters.py diff --git a/README.md b/README.md index 754eb2a..c420ead 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ With a single API call, get access to AI models built on the latest AI breakthro - [Example](#examples) - [Core Examples](#core-examples) - [LeMUR Examples](#lemur-examples) - - [Audio Intelligence+ Examples](#audio-intelligence-examples) + - [Audio Intelligence Examples](#audio-intelligence-examples) - [Playgrounds](#playgrounds) - [Advanced](#advanced-todo) @@ -159,35 +159,6 @@ print(transcript.text) -
- Summarize the content of a transcript - -```python -import assemblyai as aai - -transcriber = aai.Transcriber() -transcript = transcriber.transcribe( - "https://example.org/audio.mp3", - config=aai.TranscriptionConfig(summarize=True) -) - -print(transcript.summary) -``` - -By default, the summarization model will be `informative` and the summarization type will be `bullets`. [Read more about summarization models and types here](https://www.assemblyai.com/docs/Models/summarization#types-and-models). - -To change the model and/or type, pass additional parameters to the `TranscriptionConfig`: - -```python -config=aai.TranscriptionConfig( - summarize=True, - summary_model=aai.SummarizationModel.catchy, - summary_type=aai.Summarizationtype.headline -) -``` - -
- --- ### **LeMUR Examples** @@ -260,7 +231,7 @@ for result in result: --- -### **Audio Intelligence+ Examples** +### **Audio Intelligence Examples**
PII Redact a Transcript @@ -286,6 +257,57 @@ transcriber = aai.Transcriber() transcript = transcriber.transcribe("https://example.org/audio.mp3", config) ``` +
+
+ Summarize the content of a transcript over time + +```python +import assemblyai as aai + +transcriber = aai.Transcriber() +transcript = transcriber.transcribe( + "https://example.org/audio.mp3", + config=aai.TranscriptionConfig(auto_chapters=True) +) + +for chapter in transcript.chapters: + print(f"Summary: {chapter.summary}") # A one paragraph summary of the content spoken during this timeframe + print(f"Start: {chapter.start}, End: {chapter.end}") # Timestamps (in milliseconds) of the chapter + print(f"Headline: {chapter.headline}") # A single sentence summary of the content spoken during this timeframe + print(f"Gist: {chapter.gist}") # An ultra-short summary, just a few words, of the content spoken during this timeframe +``` + +[Read more about auto chapters here.](https://www.assemblyai.com/docs/Models/auto_chapters) + +
+ +
+ Summarize the content of a transcript + +```python +import assemblyai as aai + +transcriber = aai.Transcriber() +transcript = transcriber.transcribe( + "https://example.org/audio.mp3", + config=aai.TranscriptionConfig(summarization=True) +) + +print(transcript.summary) +``` + +By default, the summarization model will be `informative` and the summarization type will be `bullets`. [Read more about summarization models and types here](https://www.assemblyai.com/docs/Models/summarization#types-and-models). + +To change the model and/or type, pass additional parameters to the `TranscriptionConfig`: + +```python +config=aai.TranscriptionConfig( + summarization=True, + summary_model=aai.SummarizationModel.catchy, + summary_type=aai.SummarizationType.headline +) +``` +
--- @@ -297,7 +319,6 @@ Visit one of our Playgrounds: - [LeMUR Playground](https://www.assemblyai.com/playground/v2/source) - [Transcription Playground](https://www.assemblyai.com/playground) - # Advanced ## How the SDK handles Default Configurations @@ -329,7 +350,6 @@ transcriber = aai.Transcriber() transcriber.config = aai.TranscriptionConfig(punctuate=False, format_text=False) ``` - In case you want to override the `Transcriber`'s configuration for a specific operation with a different one, you can do so via the `config` parameter of a `.transcribe*(...)` method: ```python diff --git a/assemblyai/transcriber.py b/assemblyai/transcriber.py index 5a5ad23..42a9159 100644 --- a/assemblyai/transcriber.py +++ b/assemblyai/transcriber.py @@ -210,6 +210,10 @@ def summary(self) -> Optional[str]: return self._impl.transcript.summary + @property + def chapters(self) -> Optional[List[types.Chapter]]: + return self._impl.transcript.chapters + @property def status(self) -> types.TranscriptStatus: "The current status of the transcript" diff --git a/assemblyai/types.py b/assemblyai/types.py index 647947f..6b87aaf 100644 --- a/assemblyai/types.py +++ b/assemblyai/types.py @@ -354,8 +354,8 @@ class RawTranscriptionConfig(BaseModel): # sentiment_analysis: bool = False # "Enable Sentiment Analysis." - # auto_chapters: bool = False - # "Enable Auto Chapters." + auto_chapters: Optional[bool] + "Enable Auto Chapters." # entity_detection: bool = False # "Enable Entity Detection." 
@@ -415,7 +415,7 @@ def __init__( custom_spelling: Optional[Dict[str, Union[str, Sequence[str]]]] = None, disfluencies: Optional[bool] = None, # sentiment_analysis: bool = False, - # auto_chapters: bool = False, + auto_chapters: Optional[bool] = None, # entity_detection: bool = False, summarization: Optional[bool] = None, summary_model: Optional[SummarizationModel] = None, @@ -491,7 +491,7 @@ def __init__( self.set_custom_spelling(custom_spelling, override=True) self.disfluencies = disfluencies # self.sentiment_analysis = sentiment_analysis - # self.auto_chapters = auto_chapters + self.auto_chapters = auto_chapters # self.entity_detection = entity_detection self.set_summarize( summarization, @@ -707,17 +707,23 @@ def disfluencies(self, enable: Optional[bool]) -> None: # self._raw_transcription_config.sentiment_analysis = enable - # @property - # def auto_chapters(self) -> bool: - # "Returns the status of the Auto Chapters feature." + @property + def auto_chapters(self) -> bool: + "Returns the status of the Auto Chapters feature." + + return self._raw_transcription_config.auto_chapters - # return self._raw_transcription_config.auto_chapters + @auto_chapters.setter + def auto_chapters(self, enable: bool) -> None: + "Enable Auto Chapters." - # @auto_chapters.setter - # def auto_chapters(self, enable: bool) -> None: - # "Enable Auto Chapters." + # Validate required params are also set + if self.punctuate == False: + raise ValueError( + "If `auto_chapters` is enabled, then `punctuate` must not be disabled" + ) - # self._raw_transcription_config.auto_chapters = enable + self._raw_transcription_config.auto_chapters = enable # @property # def entity_detection(self) -> bool: @@ -1317,8 +1323,8 @@ class BaseTranscript(BaseModel): # sentiment_analysis: bool = False # "Enable Sentiment Analysis." - # auto_chapters: bool = False - # "Enable Auto Chapters." + auto_chapters: Optional[bool] + "Enable Auto Chapters." # entity_detection: bool = False # "Enable Entity Detection." 
@@ -1401,7 +1407,7 @@ class TranscriptResponse(BaseTranscript): # iab_categories_result: Optional[IABResponse] = None # "The list of results when Topic Detection is enabled" - # chapters: Optional[List[Chapter]] = None + chapters: Optional[List[Chapter]] # "When Auto Chapters is enabled, the list of Auto Chapters results" # sentiment_analysis_results: Optional[List[Sentiment]] = None diff --git a/tests/unit/factories.py b/tests/unit/factories.py index d7fbdd8..c9eb34c 100644 --- a/tests/unit/factories.py +++ b/tests/unit/factories.py @@ -37,6 +37,17 @@ class Meta: words = factory.List([factory.SubFactory(UtteranceWordFactory)]) +class ChapterFactory(factory.Factory): + class Meta: + model = types.Chapter + + summary = factory.Faker("sentence") + headline = factory.Faker("sentence") + gist = factory.Faker("sentence") + start = factory.Faker("pyint") + end = factory.Faker("pyint") + + class BaseTranscriptFactory(factory.Factory): class Meta: model = types.BaseTranscript diff --git a/tests/unit/test_auto_chapters.py b/tests/unit/test_auto_chapters.py new file mode 100644 index 0000000..aa729cd --- /dev/null +++ b/tests/unit/test_auto_chapters.py @@ -0,0 +1,140 @@ +import json +from typing import Any, Dict, Tuple + +import factory +import httpx +import pytest +from pytest_httpx import HTTPXMock + +import assemblyai.developer_tools.python.sdk.tests.unit.factories as factories +import assemblyai as aai + +aai.settings.api_key = "test" + + +class AutoChaptersResponseFactory(factories.TranscriptCompletedResponseFactory): + chapters = factory.List([factory.SubFactory(factories.ChapterFactory)]) + + +def __submit_mock_request( + httpx_mock: HTTPXMock, + mock_response: Dict[str, Any], + config: aai.TranscriptionConfig, +) -> Tuple[Dict[str, Any], aai.Transcript]: + """ + Helper function to abstract mock transcriber calls with given `TranscriptionConfig`, + and perform some common assertions. 
+ """ + + mock_transcript_id = mock_response.get("id", "mock_id") + + # Mock initial submission response (transcript is processing) + mock_processing_response = factories.generate_dict_factory( + factories.TranscriptProcessingResponseFactory + )() + + httpx_mock.add_response( + url=f"{aai.settings.base_url}/transcript", + status_code=httpx.codes.OK, + method="POST", + json={ + **mock_processing_response, + "id": mock_transcript_id, # inject ID from main mock response + }, + ) + + # Mock polling-for-completeness response, with completed transcript + httpx_mock.add_response( + url=f"{aai.settings.base_url}/transcript/{mock_transcript_id}", + status_code=httpx.codes.OK, + method="GET", + json=mock_response, + ) + + # == Make API request via SDK == + transcript = aai.Transcriber().transcribe( + data="https://example.org/audio.wav", + config=config, + ) + + # Check that submission and polling requests were made + assert len(httpx_mock.get_requests()) == 2 + + # Extract body of initial submission request + request = httpx_mock.get_requests()[0] + request_body = json.loads(request.content.decode()) + + return request_body, transcript + + +def test_auto_chapters_fails_without_punctuation(httpx_mock: HTTPXMock): + """ + Tests whether the SDK raises an error before making a request + if `auto_chapters` is enabled and `punctuation` is disabled + """ + + with pytest.raises(ValueError) as error: + __submit_mock_request( + httpx_mock, + mock_response={}, # response doesn't matter, since it shouldn't occur + config=aai.TranscriptionConfig( + auto_chapters=True, + punctuate=False, + ), + ) + # Check that the error message informs the user of the invalid parameter + assert "punctuate" in str(error) + + # Check that the error was raised before any requests were made + assert len(httpx_mock.get_requests()) == 0 + + # Inform httpx_mock that it's okay we didn't make any requests + httpx_mock.reset(False) + + +def test_auto_chapters_disabled_by_default(httpx_mock: HTTPXMock): + """ + 
Tests that excluding `auto_chapters` from the `TranscriptionConfig` will + result in the default behavior of it being excluded from the request body + """ + request_body, transcript = __submit_mock_request( + httpx_mock, + mock_response=factories.generate_dict_factory( + factories.TranscriptCompletedResponseFactory + )(), + config=aai.TranscriptionConfig(), + ) + assert request_body.get("auto_chapters") is None + assert transcript.chapters is None + + +def test_auto_chapters_enabled(httpx_mock: HTTPXMock): + """ + Tests that including `auto_chapters=True` in the `TranscriptionConfig` + will result in `auto_chapters=True` in the request body, and that the + response is properly parsed into a `Transcript` object + """ + mock_response = factories.generate_dict_factory(AutoChaptersResponseFactory)() + request_body, transcript = __submit_mock_request( + httpx_mock, + mock_response=mock_response, + config=aai.TranscriptionConfig(auto_chapters=True), + ) + + # Check that request body was properly defined + assert request_body.get("auto_chapters") == True + + # Check that transcript was properly parsed from JSON response + assert transcript.error is None + assert transcript.chapters is not None + assert len(transcript.chapters) > 0 + assert len(transcript.chapters) == len(mock_response["chapters"]) + + for response_chapter, transcript_chapter in zip( + mock_response["chapters"], transcript.chapters + ): + assert transcript_chapter.summary == response_chapter["summary"] + assert transcript_chapter.headline == response_chapter["headline"] + assert transcript_chapter.gist == response_chapter["gist"] + assert transcript_chapter.start == response_chapter["start"] + assert transcript_chapter.end == response_chapter["end"] From 316008c6ada87daadf0980b068a9d908e92ab400 Mon Sep 17 00:00:00 2001 From: Soheyl Date: Wed, 7 Jun 2023 10:14:44 +0200 Subject: [PATCH 2/3] fix: bad import --- tests/unit/test_auto_chapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/unit/test_auto_chapters.py b/tests/unit/test_auto_chapters.py index aa729cd..1ab5e3b 100644 --- a/tests/unit/test_auto_chapters.py +++ b/tests/unit/test_auto_chapters.py @@ -6,8 +6,8 @@ import pytest from pytest_httpx import HTTPXMock -import assemblyai.developer_tools.python.sdk.tests.unit.factories as factories import assemblyai as aai +from tests.unit import factories aai.settings.api_key = "test" From ab95cf21243590b9e598eec3a79a2e7bef028cbf Mon Sep 17 00:00:00 2001 From: Soheyl Date: Wed, 7 Jun 2023 10:15:17 +0200 Subject: [PATCH 3/3] build: bump version to `0.7.0` --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6cffa7b..fb249e6 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="assemblyai", - version="0.6.0", + version="0.7.0", description="AssemblyAI Python SDK", author="AssemblyAI", author_email="engineering.sdk@assemblyai.com",