diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 0b2f3644..0fe38b5a 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -48,3 +48,18 @@ Here are a few types of contributions that we would be interested in hearing abo ## Making Code Contributions for those interested in contributing code to the project, please review the [Code Contribution Guide](https://github.com/deepgram/deepgram-python-sdk/blob/main/.github/CODE_CONTRIBUTIONS_GUIDE.md) for more details. + +## Building Locally + +Assuming you are using `pipenv`: + +```bash +# Install deps +pipenv install +# Build package +pipenv run python3 -m build +# Install package from local build +pipenv install ./dist/deepgram_sdk-0.0.0.tar.gz +# Try an example! +DEEPGRAM_API_KEY= pipenv run python3 examples/agent/async_simple/main.py +``` \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8e4929ee..910c97f3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ dist/ build/ poetry.lock +# examples +chatlog.txt +output_*.wav diff --git a/deepgram/__init__.py b/deepgram/__init__.py index 3873d7fa..95f056d3 100644 --- a/deepgram/__init__.py +++ b/deepgram/__init__.py @@ -324,7 +324,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -333,27 +332,29 @@ from .client import ( # top level - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, AgentKeepAlive, # sub level Listen, + ListenProvider, Speak, + SpeakProvider, Header, Item, Properties, Parameters, Function, - Provider, Think, + ThinkProvider, Agent, Input, Output, Audio, - Context, + Endpoint, ) # utilities diff --git a/deepgram/client.py b/deepgram/client.py index 5c3e973e..c9d20fd1 100644 --- a/deepgram/client.py +++ b/deepgram/client.py @@ -338,7 +338,6 @@ 
ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -347,27 +346,29 @@ from .clients import ( # top level - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, AgentKeepAlive, # sub level Listen, + ListenProvider, Speak, + SpeakProvider, Header, Item, Properties, Parameters, Function, - Provider, Think, + ThinkProvider, Agent, Input, Output, Audio, - Context, + Endpoint, ) diff --git a/deepgram/clients/__init__.py b/deepgram/clients/__init__.py index 75f46213..0f573d2e 100644 --- a/deepgram/clients/__init__.py +++ b/deepgram/clients/__init__.py @@ -347,7 +347,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -356,25 +355,27 @@ from .agent import ( # top level - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, AgentKeepAlive, # sub level Listen, + ListenProvider, Speak, + SpeakProvider, Header, Item, Properties, Parameters, Function, - Provider, Think, + ThinkProvider, Agent, Input, Output, Audio, - Context, + Endpoint, ) diff --git a/deepgram/clients/agent/__init__.py b/deepgram/clients/agent/__init__.py index 33988571..63c598ef 100644 --- a/deepgram/clients/agent/__init__.py +++ b/deepgram/clients/agent/__init__.py @@ -22,7 +22,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -31,25 +30,27 @@ from .client import ( # top level - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, 
InjectAgentMessageOptions, FunctionCallResponse, AgentKeepAlive, # sub level Listen, + ListenProvider, Speak, + SpeakProvider, Header, Item, Properties, Parameters, Function, - Provider, Think, + ThinkProvider, Agent, Input, Output, Audio, - Context, + Endpoint, ) diff --git a/deepgram/clients/agent/client.py b/deepgram/clients/agent/client.py index ad3b868e..6dbfd08f 100644 --- a/deepgram/clients/agent/client.py +++ b/deepgram/clients/agent/client.py @@ -21,7 +21,6 @@ ConversationTextResponse as LatestConversationTextResponse, UserStartedSpeakingResponse as LatestUserStartedSpeakingResponse, AgentThinkingResponse as LatestAgentThinkingResponse, - FunctionCalling as LatestFunctionCalling, FunctionCallRequest as LatestFunctionCallRequest, AgentStartedSpeakingResponse as LatestAgentStartedSpeakingResponse, AgentAudioDoneResponse as LatestAgentAudioDoneResponse, @@ -30,27 +29,29 @@ from .v1 import ( # top level - SettingsConfigurationOptions as LatestSettingsConfigurationOptions, - UpdateInstructionsOptions as LatestUpdateInstructionsOptions, + SettingsOptions as LatestSettingsOptions, + UpdatePromptOptions as LatestUpdatePromptOptions, UpdateSpeakOptions as LatestUpdateSpeakOptions, InjectAgentMessageOptions as LatestInjectAgentMessageOptions, FunctionCallResponse as LatestFunctionCallResponse, AgentKeepAlive as LatestAgentKeepAlive, # sub level Listen as LatestListen, + ListenProvider as LatestListenProvider, Speak as LatestSpeak, + SpeakProvider as LatestSpeakProvider, Header as LatestHeader, Item as LatestItem, Properties as LatestProperties, Parameters as LatestParameters, Function as LatestFunction, - Provider as LatestProvider, Think as LatestThink, + ThinkProvider as LatestThinkProvider, Agent as LatestAgent, Input as LatestInput, Output as LatestOutput, Audio as LatestAudio, - Context as LatestContext, + Endpoint as LatestEndpoint, ) @@ -70,31 +71,32 @@ ConversationTextResponse = LatestConversationTextResponse UserStartedSpeakingResponse = 
LatestUserStartedSpeakingResponse AgentThinkingResponse = LatestAgentThinkingResponse -FunctionCalling = LatestFunctionCalling FunctionCallRequest = LatestFunctionCallRequest AgentStartedSpeakingResponse = LatestAgentStartedSpeakingResponse AgentAudioDoneResponse = LatestAgentAudioDoneResponse InjectionRefusedResponse = LatestInjectionRefusedResponse -SettingsConfigurationOptions = LatestSettingsConfigurationOptions -UpdateInstructionsOptions = LatestUpdateInstructionsOptions +SettingsOptions = LatestSettingsOptions +UpdatePromptOptions = LatestUpdatePromptOptions UpdateSpeakOptions = LatestUpdateSpeakOptions InjectAgentMessageOptions = LatestInjectAgentMessageOptions FunctionCallResponse = LatestFunctionCallResponse AgentKeepAlive = LatestAgentKeepAlive Listen = LatestListen +ListenProvider = LatestListenProvider Speak = LatestSpeak +SpeakProvider = LatestSpeakProvider Header = LatestHeader Item = LatestItem Properties = LatestProperties Parameters = LatestParameters Function = LatestFunction -Provider = LatestProvider Think = LatestThink +ThinkProvider = LatestThinkProvider Agent = LatestAgent Input = LatestInput Output = LatestOutput Audio = LatestAudio -Context = LatestContext +Endpoint = LatestEndpoint \ No newline at end of file diff --git a/deepgram/clients/agent/enums.py b/deepgram/clients/agent/enums.py index df20debc..e0928dd2 100644 --- a/deepgram/clients/agent/enums.py +++ b/deepgram/clients/agent/enums.py @@ -21,7 +21,6 @@ class AgentWebSocketEvents(StrEnum): ConversationText: str = "ConversationText" UserStartedSpeaking: str = "UserStartedSpeaking" AgentThinking: str = "AgentThinking" - FunctionCalling: str = "FunctionCalling" FunctionCallRequest: str = "FunctionCallRequest" AgentStartedSpeaking: str = "AgentStartedSpeaking" AgentAudioDone: str = "AgentAudioDone" @@ -29,8 +28,8 @@ class AgentWebSocketEvents(StrEnum): Unhandled: str = "Unhandled" # client - SettingsConfiguration: str = "SettingsConfiguration" - UpdateInstructions: str = 
"UpdateInstructions" + Settings: str = "Settings" + UpdatePrompt: str = "UpdatePrompt" UpdateSpeak: str = "UpdateSpeak" InjectAgentMessage: str = "InjectAgentMessage" InjectionRefused: str = "InjectionRefused" diff --git a/deepgram/clients/agent/v1/__init__.py b/deepgram/clients/agent/v1/__init__.py index 305cb891..8d48b80b 100644 --- a/deepgram/clients/agent/v1/__init__.py +++ b/deepgram/clients/agent/v1/__init__.py @@ -26,7 +26,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -35,25 +34,27 @@ from .websocket import ( # top level - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, AgentKeepAlive, # sub level Listen, + ListenProvider, Speak, + SpeakProvider, Header, Item, Properties, Parameters, Function, - Provider, Think, + ThinkProvider, Agent, Input, Output, Audio, - Context, + Endpoint, ) diff --git a/deepgram/clients/agent/v1/websocket/__init__.py b/deepgram/clients/agent/v1/websocket/__init__.py index e2d5cdba..b1cec3f2 100644 --- a/deepgram/clients/agent/v1/websocket/__init__.py +++ b/deepgram/clients/agent/v1/websocket/__init__.py @@ -18,7 +18,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -26,25 +25,27 @@ ) from .options import ( # top level - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, AgentKeepAlive, # sub level Listen, + ListenProvider, Speak, + SpeakProvider, Header, Item, Properties, Parameters, Function, - Provider, Think, + ThinkProvider, Agent, Input, Output, Audio, - Context, + Endpoint, ) diff --git 
a/deepgram/clients/agent/v1/websocket/async_client.py b/deepgram/clients/agent/v1/websocket/async_client.py index 65f7a3bc..46524468 100644 --- a/deepgram/clients/agent/v1/websocket/async_client.py +++ b/deepgram/clients/agent/v1/websocket/async_client.py @@ -21,7 +21,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -31,8 +30,8 @@ UnhandledResponse, ) from .options import ( - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, @@ -79,7 +78,7 @@ class AsyncAgentWebSocketClient( _kwargs: Optional[Dict] = None _addons: Optional[Dict] = None # note the distinction here. We can't use _config because it's already used in the parent - _settings: Optional[SettingsConfigurationOptions] = None + _settings: Optional[SettingsOptions] = None _headers: Optional[Dict] = None _speaker_created: bool = False @@ -98,7 +97,7 @@ def __init__(self, config: DeepgramClientOptions): self._config = config # needs to be "wss://agent.deepgram.com/agent" - self._endpoint = "agent" + self._endpoint = "v1/agent/converse" # override the endpoint since it needs to be "wss://agent.deepgram.com/agent" self._config.url = "agent.deepgram.com" @@ -179,7 +178,7 @@ def __init__(self, config: DeepgramClientOptions): # pylint: disable=too-many-branches,too-many-statements async def start( self, - options: Optional[SettingsConfigurationOptions] = None, + options: Optional[SettingsOptions] = None, addons: Optional[Dict] = None, headers: Optional[Dict] = None, members: Optional[Dict] = None, @@ -195,7 +194,7 @@ async def start( self._logger.info("members: %s", members) self._logger.info("kwargs: %s", kwargs) - if isinstance(options, SettingsConfigurationOptions) and not options.check(): + if isinstance(options, SettingsOptions) and not options.check(): 
self._logger.error("settings.check failed") self._logger.debug("AsyncAgentWebSocketClient.start LEAVE") raise DeepgramError("Fatal agent settings error") @@ -213,19 +212,19 @@ async def start( else: self._kwargs = {} - if isinstance(options, SettingsConfigurationOptions): + if isinstance(options, SettingsOptions): self._logger.info("options is class") self._settings = options elif isinstance(options, dict): self._logger.info("options is dict") - self._settings = SettingsConfigurationOptions.from_dict(options) + self._settings = SettingsOptions.from_dict(options) elif isinstance(options, str): self._logger.info("options is json") - self._settings = SettingsConfigurationOptions.from_json(options) + self._settings = SettingsOptions.from_json(options) else: raise DeepgramError("Invalid options type") - if self._settings.agent.listen.keyterms is not None and self._settings.agent.listen.model is not None and not self._settings.agent.listen.model.startswith("nova-3"): + if self._settings.agent.listen.provider.keyterms is not None and self._settings.agent.listen.provider.model is not None and not self._settings.agent.listen.provider.model.startswith("nova-3"): raise DeepgramError("Keyterms are only supported for nova-3 models") try: @@ -277,14 +276,14 @@ async def start( self._logger.debug("number of active threads: %s", threading.active_count()) # send the configurationsetting message - self._logger.notice("Sending ConfigurationSettings...") + self._logger.notice("Sending Settings...") ret_send_cs = await self.send(str(self._settings)) if not ret_send_cs: - self._logger.error("ConfigurationSettings failed") + self._logger.error("Settings failed") err_error: ErrorResponse = ErrorResponse( "Exception in AsyncAgentWebSocketClient.start", - "ConfigurationSettings failed to send", + "Settings failed to send", "Exception", ) await self._emit( @@ -432,16 +431,6 @@ async def _process_text(self, message: str) -> None: agent_thinking=agent_thinking_result, **dict(cast(Dict[Any, 
Any], self._kwargs)), ) - case AgentWebSocketEvents.FunctionCalling: - function_calling_result: FunctionCalling = ( - FunctionCalling.from_json(message) - ) - self._logger.verbose("FunctionCalling: %s", function_calling_result) - await self._emit( - AgentWebSocketEvents(AgentWebSocketEvents.FunctionCalling), - function_calling=function_calling_result, - **dict(cast(Dict[Any, Any], self._kwargs)), - ) case AgentWebSocketEvents.FunctionCallRequest: function_call_request_result: FunctionCallRequest = ( FunctionCallRequest.from_json(message) diff --git a/deepgram/clients/agent/v1/websocket/client.py b/deepgram/clients/agent/v1/websocket/client.py index 9b921007..fae7830b 100644 --- a/deepgram/clients/agent/v1/websocket/client.py +++ b/deepgram/clients/agent/v1/websocket/client.py @@ -21,7 +21,6 @@ ConversationTextResponse, UserStartedSpeakingResponse, AgentThinkingResponse, - FunctionCalling, FunctionCallRequest, AgentStartedSpeakingResponse, AgentAudioDoneResponse, @@ -31,8 +30,8 @@ UnhandledResponse, ) from .options import ( - SettingsConfigurationOptions, - UpdateInstructionsOptions, + SettingsOptions, + UpdatePromptOptions, UpdateSpeakOptions, InjectAgentMessageOptions, FunctionCallResponse, @@ -79,7 +78,7 @@ class AgentWebSocketClient( _kwargs: Optional[Dict] = None _addons: Optional[Dict] = None # note the distinction here. 
We can't use _config because it's already used in the parent - _settings: Optional[SettingsConfigurationOptions] = None + _settings: Optional[SettingsOptions] = None _headers: Optional[Dict] = None _speaker_created: bool = False @@ -98,7 +97,7 @@ def __init__(self, config: DeepgramClientOptions): self._config = config # needs to be "wss://agent.deepgram.com/agent" - self._endpoint = "agent" + self._endpoint = "v1/agent/converse" # override the endpoint since it needs to be "wss://agent.deepgram.com/agent" self._config.url = "agent.deepgram.com" @@ -180,7 +179,7 @@ def __init__(self, config: DeepgramClientOptions): # pylint: disable=too-many-statements,too-many-branches def start( self, - options: Optional[SettingsConfigurationOptions] = None, + options: Optional[SettingsOptions] = None, addons: Optional[Dict] = None, headers: Optional[Dict] = None, members: Optional[Dict] = None, @@ -196,7 +195,7 @@ def start( self._logger.info("members: %s", members) self._logger.info("kwargs: %s", kwargs) - if isinstance(options, SettingsConfigurationOptions) and not options.check(): + if isinstance(options, SettingsOptions) and not options.check(): self._logger.error("settings.check failed") self._logger.debug("AgentWebSocketClient.start LEAVE") raise DeepgramError("Fatal agent settings error") @@ -214,19 +213,24 @@ def start( else: self._kwargs = {} - if isinstance(options, SettingsConfigurationOptions): + if isinstance(options, SettingsOptions): self._logger.info("options is class") self._settings = options elif isinstance(options, dict): self._logger.info("options is dict") - self._settings = SettingsConfigurationOptions.from_dict(options) + self._settings = SettingsOptions.from_dict(options) elif isinstance(options, str): self._logger.info("options is json") - self._settings = SettingsConfigurationOptions.from_json(options) + self._settings = SettingsOptions.from_json(options) else: raise DeepgramError("Invalid options type") - if self._settings.agent.listen.keyterms is not 
None and self._settings.agent.listen.model is not None and not self._settings.agent.listen.model.startswith("nova-3"): + if ( + self._settings.agent.listen.provider + and self._settings.agent.listen.provider.keyterms is not None + and self._settings.agent.listen.provider.model is not None + and not self._settings.agent.listen.provider.model.startswith("nova-3") + ): raise DeepgramError("Keyterms are only supported for nova-3 models") try: @@ -278,15 +282,15 @@ def start( self._logger.debug("after running thread: %s", thread.name) self._logger.debug("number of active threads: %s", threading.active_count()) - # send the configurationsetting message - self._logger.notice("Sending ConfigurationSettings...") + # send the Settings message + self._logger.notice("Sending Settings...") ret_send_cs = self.send(str(self._settings)) if not ret_send_cs: - self._logger.error("ConfigurationSettings failed") + self._logger.error("Settings failed") err_error: ErrorResponse = ErrorResponse( "Exception in AgentWebSocketClient.start", - "ConfigurationSettings failed to send", + "Settings failed to send", "Exception", ) self._emit( @@ -427,16 +431,6 @@ def _process_text(self, message: str) -> None: agent_thinking=agent_thinking_result, **dict(cast(Dict[Any, Any], self._kwargs)), ) - case AgentWebSocketEvents.FunctionCalling: - function_calling_result: FunctionCalling = ( - FunctionCalling.from_json(message) - ) - self._logger.verbose("FunctionCalling: %s", function_calling_result) - self._emit( - AgentWebSocketEvents(AgentWebSocketEvents.FunctionCalling), - function_calling=function_calling_result, - **dict(cast(Dict[Any, Any], self._kwargs)), - ) case AgentWebSocketEvents.FunctionCallRequest: function_call_request_result: FunctionCallRequest = ( FunctionCallRequest.from_json(message) diff --git a/deepgram/clients/agent/v1/websocket/options.py b/deepgram/clients/agent/v1/websocket/options.py index e9c4961c..8ecea60c 100644 --- a/deepgram/clients/agent/v1/websocket/options.py +++ 
b/deepgram/clients/agent/v1/websocket/options.py @@ -17,38 +17,6 @@ # ConfigurationSettings -@dataclass -class Listen(BaseResponse): - """ - This class defines any configuration settings for the Listen model. - """ - - model: Optional[str] = field( - default=None, metadata=dataclass_config(exclude=lambda f: f is None) - ) - keyterms: Optional[List[str]] = field( - default=None, metadata=dataclass_config(exclude=lambda f: f is None) - ) - - -@dataclass -class Speak(BaseResponse): - """ - This class defines any configuration settings for the Speak model. - """ - - model: Optional[str] = field( - default="aura-asteria-en", - metadata=dataclass_config(exclude=lambda f: f is None), - ) - provider: Optional[str] = field( - default=None, metadata=dataclass_config(exclude=lambda f: f is None) - ) - voice_id: Optional[str] = field( - default=None, metadata=dataclass_config(exclude=lambda f: f is None) - ) - - @dataclass class Header(BaseResponse): """ @@ -101,6 +69,27 @@ def __getitem__(self, key): return _dict[key] +@dataclass +class Endpoint(BaseResponse): + """ + Define a custom endpoint for the agent. 
+ """ + + method: Optional[str] = field(default="POST") + url: str = field(default="") + headers: Optional[List[Header]] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) + + def __getitem__(self, key): + _dict = self.to_dict() + if "headers" in _dict: + _dict["headers"] = [ + Header.from_dict(headers) for headers in _dict["headers"] + ] + return _dict[key] + + @dataclass class Function(BaseResponse): """ @@ -117,58 +106,175 @@ class Function(BaseResponse): parameters: Optional[Parameters] = field( default=None, metadata=dataclass_config(exclude=lambda f: f is None) ) + endpoint: Optional[Endpoint] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) def __getitem__(self, key): _dict = self.to_dict() - if "parameters" in _dict: - _dict["parameters"] = [ - Parameters.from_dict(parameters) for parameters in _dict["parameters"] - ] - if "headers" in _dict: - _dict["headers"] = [ - Header.from_dict(headers) for headers in _dict["headers"] - ] + if "parameters" in _dict and isinstance(_dict["parameters"], dict): + _dict["parameters"] = Parameters.from_dict(_dict["parameters"]) + if "headers" in _dict and isinstance(_dict["headers"], list): + _dict["headers"] = [Header.from_dict(header) for header in _dict["headers"]] + if "endpoint" in _dict and isinstance(_dict["endpoint"], dict): + _dict["endpoint"] = Endpoint.from_dict(_dict["endpoint"]) + return _dict[key] + + +@dataclass +class CartesiaVoice(BaseResponse): + """ + This class defines the voice for the Cartesia model. + """ + + mode: str = field( + default="", metadata=dataclass_config(exclude=lambda f: f is None or f == "") + ) + id: str = field( + default="", metadata=dataclass_config(exclude=lambda f: f is None or f == "") + ) + + +@dataclass +class ListenProvider(BaseResponse): + """ + This class defines the provider for the Listen model. 
+ """ + + type: str = field(default="") + model: str = field(default="") + keyterms: Optional[List[str]] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) + + def __getitem__(self, key): + _dict = self.to_dict() + if "keyterms" in _dict and isinstance(_dict["keyterms"], list): + _dict["keyterms"] = [str(keyterm) for keyterm in _dict["keyterms"]] return _dict[key] @dataclass -class Provider(BaseResponse): +class ThinkProvider(BaseResponse): """ This class defines the provider for the Think model. """ - type: Optional[str] = field( + type: Optional[str] = field(default=None) + model: Optional[str] = field(default=None) + temperature: Optional[float] = field( default=None, metadata=dataclass_config(exclude=lambda f: f is None) ) @dataclass -class Think(BaseResponse): +class SpeakProvider(BaseResponse): """ - This class defines any configuration settings for the Think model. + This class defines the provider for the Speak model. """ - provider: Provider = field(default_factory=Provider) + type: Optional[str] = field(default="deepgram") + """ + Deepgram OR OpenAI model to use. + """ model: Optional[str] = field( + default="aura-2-thalia-en", + metadata=dataclass_config(exclude=lambda f: f is None), + ) + """ + ElevenLabs or Cartesia model to use. + """ + model_id: Optional[str] = field( default=None, metadata=dataclass_config(exclude=lambda f: f is None) ) - instructions: Optional[str] = field( + """ + Cartesia voice configuration. + """ + voice: Optional[CartesiaVoice] = field( default=None, metadata=dataclass_config(exclude=lambda f: f is None) ) + """ + Cartesia language. + """ + language: Optional[str] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) + """ + ElevenLabs language. 
+ """ + language_code: Optional[str] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) + + def __getitem__(self, key): + _dict = self.to_dict() + if "voice" in _dict and isinstance(_dict["voice"], dict): + _dict["voice"] = CartesiaVoice.from_dict(_dict["voice"]) + return _dict[key] + + +@dataclass +class Think(BaseResponse): + """ + This class defines any configuration settings for the Think model. + """ + + provider: ThinkProvider = field(default_factory=ThinkProvider) functions: Optional[List[Function]] = field( default=None, metadata=dataclass_config(exclude=lambda f: f is None) ) + endpoint: Optional[Endpoint] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) + prompt: Optional[str] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) def __getitem__(self, key): _dict = self.to_dict() - if "provider" in _dict: - _dict["provider"] = [ - Provider.from_dict(provider) for provider in _dict["provider"] - ] - if "functions" in _dict: + if "provider" in _dict and isinstance(_dict["provider"], dict): + _dict["provider"] = ThinkProvider.from_dict(_dict["provider"]) + if "functions" in _dict and isinstance(_dict["functions"], list): _dict["functions"] = [ - Function.from_dict(functions) for functions in _dict["functions"] + Function.from_dict(function) for function in _dict["functions"] ] + if "endpoint" in _dict and isinstance(_dict["endpoint"], dict): + _dict["endpoint"] = Endpoint.from_dict(_dict["endpoint"]) + return _dict[key] + + +@dataclass +class Listen(BaseResponse): + """ + This class defines any configuration settings for the Listen model. 
+ """ + + provider: ListenProvider = field(default_factory=ListenProvider) + + def __getitem__(self, key): + _dict = self.to_dict() + if "provider" in _dict and isinstance(_dict["provider"], dict): + _dict["provider"] = ListenProvider.from_dict(_dict["provider"]) + return _dict[key] + + +@dataclass +class Speak(BaseResponse): + """ + This class defines any configuration settings for the Speak model. + """ + + provider: SpeakProvider = field(default_factory=SpeakProvider) + endpoint: Optional[Endpoint] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) + + def __getitem__(self, key): + _dict = self.to_dict() + if "provider" in _dict and isinstance(_dict["provider"], dict): + _dict["provider"] = SpeakProvider.from_dict(_dict["provider"]) + if "endpoint" in _dict and isinstance(_dict["endpoint"], dict): + _dict["endpoint"] = Endpoint.from_dict(_dict["endpoint"]) return _dict[key] @@ -181,15 +287,18 @@ class Agent(BaseResponse): listen: Listen = field(default_factory=Listen) think: Think = field(default_factory=Think) speak: Speak = field(default_factory=Speak) + greeting: Optional[str] = field( + default=None, metadata=dataclass_config(exclude=lambda f: f is None) + ) def __getitem__(self, key): _dict = self.to_dict() - if "listen" in _dict: - _dict["listen"] = [Listen.from_dict(listen) for listen in _dict["listen"]] - if "think" in _dict: - _dict["think"] = [Think.from_dict(think) for think in _dict["think"]] - if "speak" in _dict: - _dict["speak"] = [Speak.from_dict(speak) for speak in _dict["speak"]] + if "listen" in _dict and isinstance(_dict["listen"], dict): + _dict["listen"] = Listen.from_dict(_dict["listen"]) + if "think" in _dict and isinstance(_dict["think"], dict): + _dict["think"] = Think.from_dict(_dict["think"]) + if "speak" in _dict and isinstance(_dict["speak"], dict): + _dict["speak"] = Speak.from_dict(_dict["speak"]) return _dict[key] @@ -228,54 +337,39 @@ class Audio(BaseResponse): def __getitem__(self, key): _dict = 
self.to_dict() - if "input" in _dict: - _dict["input"] = [Input.from_dict(input) for input in _dict["input"]] - if "output" in _dict: - _dict["output"] = [Output.from_dict(output) for output in _dict["output"]] + if "input" in _dict and isinstance(_dict["input"], dict): + _dict["input"] = Input.from_dict(_dict["input"]) + if "output" in _dict and isinstance(_dict["output"], dict): + _dict["output"] = Output.from_dict(_dict["output"]) return _dict[key] @dataclass -class Context(BaseResponse): +class Language(BaseResponse): """ - This class defines any configuration settings for the context. + Define the language for the agent. """ - messages: Optional[List[Tuple[str, str]]] = field( - default=None, metadata=dataclass_config(exclude=lambda f: f is None) - ) - replay: Optional[bool] = field(default=False) - - def __getitem__(self, key): - _dict = self.to_dict() - if "messages" in _dict: - _dict["messages"] = _dict["messages"].copy() - return _dict[key] + type: str = field(default="en") @dataclass -class SettingsConfigurationOptions(BaseResponse): +class SettingsOptions(BaseResponse): """ - The client should send a SettingsConfiguration message immediately after opening the websocket and before sending any audio. + The client should send a Settings message immediately after opening the websocket and before sending any audio. 
""" - type: str = str(AgentWebSocketEvents.SettingsConfiguration) + experimental: Optional[bool] = field(default=False) + type: str = str(AgentWebSocketEvents.Settings) audio: Audio = field(default_factory=Audio) agent: Agent = field(default_factory=Agent) - context: Optional[Context] = field( - default=None, metadata=dataclass_config(exclude=lambda f: f is None) - ) def __getitem__(self, key): _dict = self.to_dict() - if "audio" in _dict: - _dict["audio"] = [Audio.from_dict(audio) for audio in _dict["audio"]] - if "agent" in _dict: - _dict["agent"] = [Agent.from_dict(agent) for agent in _dict["agent"]] - if "context" in _dict: - _dict["context"] = [ - Context.from_dict(context) for context in _dict["context"] - ] + if "audio" in _dict and isinstance(_dict["audio"], dict): + _dict["audio"] = Audio.from_dict(_dict["audio"]) + if "agent" in _dict and isinstance(_dict["agent"], dict): + _dict["agent"] = Agent.from_dict(_dict["agent"]) return _dict[key] def check(self): @@ -294,17 +388,17 @@ def check(self): return True -# UpdateInstructions +# UpdatePrompt @dataclass -class UpdateInstructionsOptions(BaseResponse): +class UpdatePromptOptions(BaseResponse): """ - The client can send an UpdateInstructions message to give additional instructions to the Think model in the middle of a conversation. + The client can send an UpdatePrompt message to provide a new prompt to the Think model in the middle of a conversation. 
""" - type: str = str(AgentWebSocketEvents.UpdateInstructions) - instructions: str = field(default="") + type: str = str(AgentWebSocketEvents.UpdatePrompt) + prompt: str = field(default="") # UpdateSpeak @@ -317,7 +411,7 @@ class UpdateSpeakOptions(BaseResponse): """ type: str = str(AgentWebSocketEvents.UpdateSpeak) - model: str = field(default="") + speak: Speak = field(default_factory=Speak) # InjectAgentMessage diff --git a/deepgram/clients/agent/v1/websocket/response.py b/deepgram/clients/agent/v1/websocket/response.py index f4cc737e..c0dc226d 100644 --- a/deepgram/clients/agent/v1/websocket/response.py +++ b/deepgram/clients/agent/v1/websocket/response.py @@ -25,7 +25,7 @@ class WelcomeResponse(BaseResponse): """ type: str - session_id: str + request_id: str @dataclass @@ -61,21 +61,13 @@ class UserStartedSpeakingResponse(BaseResponse): class AgentThinkingResponse(BaseResponse): """ The server will send an AgentThinking message to inform the client of a non-verbalized agent thought. + You will ONLY receive this message if you have set `experimental` to true. """ type: str content: str -@dataclass -class FunctionCalling(BaseResponse): - """ - The server will sometimes send FunctionCalling messages when making function calls to help the client developer debug function calling workflows. - """ - - type: str - - @dataclass class FunctionCallRequest(BaseResponse): """ diff --git a/examples/agent/async_simple/main.py b/examples/agent/async_simple/main.py index 3c04b090..ca14e96b 100644 --- a/examples/agent/async_simple/main.py +++ b/examples/agent/async_simple/main.py @@ -11,7 +11,7 @@ DeepgramClient, DeepgramClientOptions, AgentWebSocketEvents, - SettingsConfigurationOptions, + SettingsOptions, ) TTS_TEXT = "Hello, this is a text to speech example using Deepgram." 
@@ -73,9 +73,6 @@ async def on_user_started_speaking(self, user_started_speaking, **kwargs): async def on_agent_thinking(self, agent_thinking, **kwargs): print(f"\n\n{agent_thinking}\n\n") - async def on_function_calling(self, function_calling, **kwargs): - print(f"\n\n{function_calling}\n\n") - async def on_agent_started_speaking(self, agent_started_speaking, **kwargs): print(f"\n\n{agent_started_speaking}\n\n") @@ -100,7 +97,6 @@ async def on_unhandled(self, unhandled, **kwargs): AgentWebSocketEvents.UserStartedSpeaking, on_user_started_speaking ) dg_connection.on(AgentWebSocketEvents.AgentThinking, on_agent_thinking) - dg_connection.on(AgentWebSocketEvents.FunctionCalling, on_function_calling) dg_connection.on( AgentWebSocketEvents.AgentStartedSpeaking, on_agent_started_speaking ) @@ -110,10 +106,16 @@ async def on_unhandled(self, unhandled, **kwargs): dg_connection.on(AgentWebSocketEvents.Unhandled, on_unhandled) # connect to websocket - options = SettingsConfigurationOptions() + options = SettingsOptions() options.agent.think.provider.type = "open_ai" - options.agent.think.model = "gpt-4o-mini" - options.agent.think.instructions = "You are a helpful AI assistant." + options.agent.think.provider.model = "gpt-4o-mini" + options.agent.think.prompt = "You are a helpful AI assistant." + options.agent.greeting = "Hello, this is a text to speech example using Deepgram." + options.agent.listen.provider.keyterms = ["hello", "goodbye"] + options.agent.listen.provider.model = "nova-3" + options.agent.listen.provider.type = "deepgram" + options.agent.language = "en" + print("\n\nPress Enter to stop...\n\n") if await dg_connection.start(options) is False: diff --git a/examples/agent/no_mic/main.py b/examples/agent/no_mic/main.py new file mode 100644 index 00000000..be638178 --- /dev/null +++ b/examples/agent/no_mic/main.py @@ -0,0 +1,243 @@ +# Copyright 2025 Deepgram SDK contributors. All Rights Reserved.
+# Use of this source code is governed by a MIT license that can be found in the LICENSE file. +# SPDX-License-Identifier: MIT + +# Import dependencies and set up the main function +import requests +import wave +import io +import time +import os +import json +import threading +from datetime import datetime + +from deepgram import ( + DeepgramClient, + DeepgramClientOptions, + AgentWebSocketEvents, + AgentKeepAlive, +) +from deepgram.clients.agent.v1.websocket.options import SettingsOptions + +def main(): + try: + # Initialize the Voice Agent + api_key = os.getenv("DEEPGRAM_API_KEY") + if not api_key: + raise ValueError("DEEPGRAM_API_KEY environment variable is not set") + print("API Key found") + + # Initialize Deepgram client + config = DeepgramClientOptions( + options={ + "keepalive": "true", + # "speaker_playback": "true", + }, + ) + deepgram = DeepgramClient(api_key, config) + connection = deepgram.agent.websocket.v("1") + print("Created WebSocket connection...") + + # 4. Configure the Agent + options = SettingsOptions() + # Audio input configuration + options.audio.input.encoding = "linear16" + options.audio.input.sample_rate = 24000 + # Audio output configuration + options.audio.output.encoding = "linear16" + options.audio.output.sample_rate = 24000 + options.audio.output.container = "wav" + # Agent configuration + options.agent.language = "en" + options.agent.listen.provider.type = "deepgram" + options.agent.listen.provider.model = "nova-3" + options.agent.think.provider.type = "open_ai" + options.agent.think.provider.model = "gpt-4o-mini" + options.agent.think.prompt = "You are a friendly AI assistant." + options.agent.speak.provider.type = "deepgram" + options.agent.speak.provider.model = "aura-2-thalia-en" + options.agent.greeting = "Hello! How can I help you today?"
+ + # Send Keep Alive messages + def send_keep_alive(): + while True: + time.sleep(5) + print("Keep alive!") + connection.send(str(AgentKeepAlive())) + + # Start keep-alive in a separate thread + keep_alive_thread = threading.Thread(target=send_keep_alive, daemon=True) + keep_alive_thread.start() + + # Setup Event Handlers + audio_buffer = bytearray() + file_counter = 0 + processing_complete = False + + def on_audio_data(self, data, **kwargs): + nonlocal audio_buffer + audio_buffer.extend(data) + print(f"Received audio data from agent: {len(data)} bytes") + print(f"Total buffer size: {len(audio_buffer)} bytes") + print(f"Audio data format: {data[:16].hex()}...") + + def on_agent_audio_done(self, agent_audio_done, **kwargs): + nonlocal audio_buffer, file_counter, processing_complete + print(f"AgentAudioDone event received") + print(f"Buffer size at completion: {len(audio_buffer)} bytes") + print(f"Agent audio done: {agent_audio_done}") + if len(audio_buffer) > 0: + with open(f"output-{file_counter}.wav", 'wb') as f: + f.write(create_wav_header()) + f.write(audio_buffer) + print(f"Created output-{file_counter}.wav") + audio_buffer = bytearray() + file_counter += 1 + processing_complete = True + + def on_conversation_text(self, conversation_text, **kwargs): + print(f"Conversation Text: {conversation_text}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"{json.dumps(conversation_text.__dict__)}\n") + + def on_welcome(self, welcome, **kwargs): + print(f"Welcome message received: {welcome}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Welcome message: {welcome}\n") + + def on_settings_applied(self, settings_applied, **kwargs): + print(f"Settings applied: {settings_applied}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Settings applied: {settings_applied}\n") + + def on_user_started_speaking(self, user_started_speaking, **kwargs): + print(f"User Started Speaking: {user_started_speaking}") + with open("chatlog.txt", 'a') 
as chatlog: + chatlog.write(f"User Started Speaking: {user_started_speaking}\n") + + def on_agent_thinking(self, agent_thinking, **kwargs): + print(f"Agent Thinking: {agent_thinking}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Agent Thinking: {agent_thinking}\n") + + def on_agent_started_speaking(self, agent_started_speaking, **kwargs): + nonlocal audio_buffer + audio_buffer = bytearray() # Reset buffer for new response + print(f"Agent Started Speaking: {agent_started_speaking}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Agent Started Speaking: {agent_started_speaking}\n") + + def on_close(self, close, **kwargs): + print(f"Connection closed: {close}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Connection closed: {close}\n") + + def on_error(self, error, **kwargs): + print(f"Error: {error}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Error: {error}\n") + + def on_unhandled(self, unhandled, **kwargs): + print(f"Unhandled event: {unhandled}") + with open("chatlog.txt", 'a') as chatlog: + chatlog.write(f"Unhandled event: {unhandled}\n") + + # Register handlers + connection.on(AgentWebSocketEvents.AudioData, on_audio_data) + connection.on(AgentWebSocketEvents.AgentAudioDone, on_agent_audio_done) + connection.on(AgentWebSocketEvents.ConversationText, on_conversation_text) + connection.on(AgentWebSocketEvents.Welcome, on_welcome) + connection.on(AgentWebSocketEvents.SettingsApplied, on_settings_applied) + connection.on(AgentWebSocketEvents.UserStartedSpeaking, on_user_started_speaking) + connection.on(AgentWebSocketEvents.AgentThinking, on_agent_thinking) + connection.on(AgentWebSocketEvents.AgentStartedSpeaking, on_agent_started_speaking) + connection.on(AgentWebSocketEvents.Close, on_close) + connection.on(AgentWebSocketEvents.Error, on_error) + connection.on(AgentWebSocketEvents.Unhandled, on_unhandled) + print("Event handlers registered") + + # Start the connection + print("Starting 
WebSocket connection...") + if not connection.start(options): + print("Failed to start connection") + return + print("WebSocket connection started successfully") + + # Stream audio + print("Downloading and sending audio...") + response = requests.get("https://dpgr.am/spacewalk.wav", stream=True) + # Skip WAV header + header = response.raw.read(44) + + # Verify WAV header + if header[0:4] != b'RIFF' or header[8:12] != b'WAVE': + print("Invalid WAV header") + return + + # Extract sample rate from header + sample_rate = int.from_bytes(header[24:28], 'little') + + chunk_size = 8192 + total_bytes_sent = 0 + chunk_count = 0 + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + print(f"Sending chunk {chunk_count}: {len(chunk)} bytes") + connection.send(chunk) + total_bytes_sent += len(chunk) + chunk_count += 1 + time.sleep(0.1) # Small delay between chunks + + print(f"Total audio data sent: {total_bytes_sent} bytes in {chunk_count} chunks") + print("Waiting for agent response...") + + # Wait for processing + print("Waiting for processing to complete...") + start_time = time.time() + timeout = 30 # 30 second timeout + + while not processing_complete and (time.time() - start_time) < timeout: + time.sleep(1) + print(f"Still waiting for agent response... ({int(time.time() - start_time)}s elapsed)") + + if not processing_complete: + print("Processing timed out after 30 seconds") + else: + print("Processing complete. 
Check output-*.wav and chatlog.txt for results.") + + # Cleanup + connection.finish() + print("Finished") + + except Exception as e: + print(f"Error: {str(e)}") + +# WAV Header Functions +def create_wav_header(sample_rate=24000, bits_per_sample=16, channels=1): + """Create a WAV header with the specified parameters""" + byte_rate = sample_rate * channels * (bits_per_sample // 8) + block_align = channels * (bits_per_sample // 8) + + header = bytearray(44) + # RIFF header + header[0:4] = b'RIFF' + header[4:8] = b'\x00\x00\x00\x00' # File size (to be updated later) + header[8:12] = b'WAVE' + # fmt chunk + header[12:16] = b'fmt ' + header[16:20] = b'\x10\x00\x00\x00' # Subchunk1Size (16 for PCM) + header[20:22] = b'\x01\x00' # AudioFormat (1 for PCM) + header[22:24] = channels.to_bytes(2, 'little') # NumChannels + header[24:28] = sample_rate.to_bytes(4, 'little') # SampleRate + header[28:32] = byte_rate.to_bytes(4, 'little') # ByteRate + header[32:34] = block_align.to_bytes(2, 'little') # BlockAlign + header[34:36] = bits_per_sample.to_bytes(2, 'little') # BitsPerSample + # data chunk + header[36:40] = b'data' + header[40:44] = b'\x00\x00\x00\x00' # Subchunk2Size (to be updated later) + + return header + +if __name__ == "__main__": + main() diff --git a/examples/agent/simple/main.py b/examples/agent/simple/main.py index d2d875c3..c5898702 100644 --- a/examples/agent/simple/main.py +++ b/examples/agent/simple/main.py @@ -8,8 +8,7 @@ DeepgramClient, DeepgramClientOptions, AgentWebSocketEvents, - SettingsConfigurationOptions, - FunctionCalling, + SettingsOptions, FunctionCallRequest, FunctionCallResponse, ) @@ -82,9 +81,6 @@ def on_user_started_speaking(self, user_started_speaking, **kwargs): def on_agent_thinking(self, agent_thinking, **kwargs): print(f"\n\n{agent_thinking}\n\n") - def on_function_calling(self, function_calling: FunctionCalling, **kwargs): - print(f"\n\nFunction Calling Debug: {function_calling}\n\n") - def on_function_call_request( self, 
function_call_request: FunctionCallRequest, **kwargs ): @@ -122,7 +118,6 @@ def on_unhandled(self, unhandled, **kwargs): AgentWebSocketEvents.UserStartedSpeaking, on_user_started_speaking ) dg_connection.on(AgentWebSocketEvents.AgentThinking, on_agent_thinking) - dg_connection.on(AgentWebSocketEvents.FunctionCalling, on_function_calling) dg_connection.on( AgentWebSocketEvents.FunctionCallRequest, on_function_call_request ) @@ -135,13 +130,15 @@ def on_unhandled(self, unhandled, **kwargs): dg_connection.on(AgentWebSocketEvents.Unhandled, on_unhandled) # connect to websocket - options: SettingsConfigurationOptions = SettingsConfigurationOptions() - options.agent.listen.model = "nova-3" - options.agent.listen.keyterms = ["hello", "goodbye"] + options: SettingsOptions = SettingsOptions() options.agent.think.provider.type = "open_ai" - options.agent.think.model = "gpt-4o-mini" - options.agent.think.instructions = "You are a helpful AI assistant." - + options.agent.think.provider.model = "gpt-4o-mini" + options.agent.think.prompt = "You are a helpful AI assistant." + options.agent.greeting = "Hello, this is a text to speech example using Deepgram." + options.agent.listen.provider.keyterms = ["hello", "goodbye"] + options.agent.listen.provider.model = "nova-3" + options.agent.listen.provider.type = "deepgram" + options.agent.language = "en" if dg_connection.start(options) is False: print("Failed to start connection") return diff --git a/examples/requirements-examples.txt b/examples/requirements-examples.txt index 8b772e0a..309cd944 100644 --- a/examples/requirements-examples.txt +++ b/examples/requirements-examples.txt @@ -2,6 +2,7 @@ # general python-dotenv +requests # streaming libs pyaudio