From 705ad029545e0cd90aa1c9fc40c216766fe2ecd1 Mon Sep 17 00:00:00 2001 From: hlohaus <983577+hlohaus@users.noreply.github.com> Date: Fri, 21 Mar 2025 05:13:59 +0100 Subject: [PATCH] Add audio example usage --- etc/examples/audio.py | 28 ++++++++++++++++++++++++++++ g4f/Provider/PollinationsAI.py | 2 +- g4f/api/stubs.py | 1 + g4f/client/stubs.py | 16 ++++++++++++++-- g4f/image/__init__.py | 4 ++-- 5 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 etc/examples/audio.py diff --git a/etc/examples/audio.py b/etc/examples/audio.py new file mode 100644 index 00000000..851e939d --- /dev/null +++ b/etc/examples/audio.py @@ -0,0 +1,28 @@ +import asyncio +from g4f.client import AsyncClient +import g4f.Provider +import g4f.models + +async def main(): + client = AsyncClient(provider=g4f.Provider.PollinationsAI) + + # Generate audio with PollinationsAI + response = await client.chat.completions.create( + model="openai-audio", + messages=[{"role": "user", "content": "Say good day to the world"}], + audio={ "voice": "alloy", "format": "mp3" }, + ) + response.choices[0].message.save("alloy.mp3") + + # Transcribe an audio file + with open("audio.wav", "rb") as audio_file: + response = await client.chat.completions.create( + messages="Transcribe this audio", + provider=g4f.Provider.Microsoft_Phi_4, + media=[[audio_file, "audio.wav"]], + modalities=["text"], + ) + print(response.choices[0].message.content) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/g4f/Provider/PollinationsAI.py b/g4f/Provider/PollinationsAI.py index 38d9adaf..c064df1e 100644 --- a/g4f/Provider/PollinationsAI.py +++ b/g4f/Provider/PollinationsAI.py @@ -152,7 +152,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin): top_p: float = 1, frequency_penalty: float = None, response_format: Optional[dict] = None, - extra_parameters: list[str] = ["tools", "parallel_tool_calls", "tool_choice", "reasoning_effort", "logit_bias", "voice", "modalities"], + extra_parameters: 
list[str] = ["tools", "parallel_tool_calls", "tool_choice", "reasoning_effort", "logit_bias", "voice", "modalities", "audio"], **kwargs ) -> AsyncResult: # Load model list diff --git a/g4f/api/stubs.py b/g4f/api/stubs.py index 086aff79..c586c7c5 100644 --- a/g4f/api/stubs.py +++ b/g4f/api/stubs.py @@ -18,6 +18,7 @@ class ChatCompletionsConfig(BaseModel): image_name: Optional[str] = None images: Optional[list[tuple[str, str]]] = None media: Optional[list[tuple[str, str]]] = None + modalities: Optional[list[str]] = ["text", "audio"] temperature: Optional[float] = None presence_penalty: Optional[float] = None frequency_penalty: Optional[float] = None diff --git a/g4f/client/stubs.py b/g4f/client/stubs.py index 2ffdd31c..da1329ff 100644 --- a/g4f/client/stubs.py +++ b/g4f/client/stubs.py @@ -1,8 +1,10 @@ from __future__ import annotations -from typing import Optional, List, Dict, Any +from typing import Optional, List from time import time +from ..image import extract_data_uri +from ..client.helper import filter_markdown from .helper import filter_none try: @@ -103,6 +105,16 @@ class ChatCompletionMessage(BaseModel): def model_construct(cls, content: str, tool_calls: list = None): return super().model_construct(role="assistant", content=content, **filter_none(tool_calls=tool_calls)) + def save(self, filepath: str, allowed_types = None): + if self.content.startswith("data:"): + with open(filepath, "wb") as f: + f.write(extract_data_uri(self.content)) + return + content = filter_markdown(self.content, allowed_types) + if content is not None: + with open(filepath, "w") as f: + f.write(content) + class ChatCompletionChoice(BaseModel): index: int message: ChatCompletionMessage @@ -118,7 +130,7 @@ class ChatCompletion(BaseModel): created: int model: str provider: Optional[str] - choices: List[ChatCompletionChoice] + choices: list[ChatCompletionChoice] usage: UsageModel @classmethod diff --git a/g4f/image/__init__.py b/g4f/image/__init__.py index 934989c5..0ac37737 100644 --- 
a/g4f/image/__init__.py +++ b/g4f/image/__init__.py @@ -248,14 +248,14 @@ def to_input_audio(audio: ImageType, filename: str = None) -> str: if filename is not None and (filename.endswith(".wav") or filename.endswith(".mp3")): return { "data": base64.b64encode(to_bytes(audio)).decode(), - "format": "wav" if filename.endswith(".wav") else "mpeg" + "format": "wav" if filename.endswith(".wav") else "mp3" } raise ValueError("Invalid input audio") audio = re.match(r'^data:audio/(\w+);base64,(.+?)', audio) if audio: return { "data": audio.group(2), - "format": audio.group(1), + "format": audio.group(1).replace("mpeg", "mp3") } raise ValueError("Invalid input audio")