feat: add audio speech generation endpoint and media handling refactor

- Added new `/v1/audio/speech` and `/api/{path_provider}/audio/speech` endpoints in `g4f/api/__init__.py` for generating speech from text
- Introduced `AudioSpeechConfig` model in `g4f/api/stubs.py` with fields for input, model, provider, voice, instructions, and response format
- Updated `PollinationsAI.py` to support `modalities` in `kwargs` when checking for audio
- Set default voice for audio models in `PollinationsAI.py` if not provided in `kwargs`
- Added debug print in `PollinationsAI.py` to log request data to text API endpoint
- Extended supported FastAPI response types in `g4f/api/__init__.py` to include `FileResponse` from `starlette.responses`
- Added `BackgroundTask` to clean up generated audio files after serving in `g4f/api/__init__.py`
- Modified `AnyProvider.py` to include `EdgeTTS`, `gTTS`, and `MarkItDown` as audio providers when `audio` is in `kwargs` or `modalities`
- Created `resolve_media` helper in `g4f/client/__init__.py` to standardize media handling for audio/image input
- Replaced manual media preprocessing in `Completions`, `AsyncCompletions`, and `Images` classes with `resolve_media`
- Added `/docs/README.md` with a link to the documentation site
This commit is contained in:
hlohaus 2025-04-26 12:21:49 +02:00
parent b15a83ae13
commit c3632984f7
6 changed files with 90 additions and 29 deletions

1
docs/README.md Normal file
View file

@ -0,0 +1 @@
[Documentation](https://gpt4free.github.io/docs/main.html)

View file

@ -177,7 +177,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
# Load model list # Load model list
cls.get_models() cls.get_models()
if not model: if not model:
has_audio = "audio" in kwargs has_audio = "audio" in kwargs or "audio" in kwargs.get("modalities", [])
if not has_audio and media is not None: if not has_audio and media is not None:
for media_data, filename in media: for media_data, filename in media:
if is_data_an_audio(media_data, filename): if is_data_an_audio(media_data, filename):
@ -311,6 +311,8 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session: async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session:
if model in cls.audio_models: if model in cls.audio_models:
if "audio" in kwargs and kwargs.get("audio", {}).get("voice") is None:
kwargs["audio"]["voice"] = cls.audio_models[model][0]
url = cls.text_api_endpoint url = cls.text_api_endpoint
stream = False stream = False
else: else:
@ -329,6 +331,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
"cache": cache, "cache": cache,
**extra_parameters **extra_parameters
}) })
print(f"Requesting {url} with data: {data}")
async with session.post(url, json=data) as response: async with session.post(url, json=data) as response:
await raise_for_status(response) await raise_for_status(response)
if response.headers["content-type"].startswith("text/plain"): if response.headers["content-type"].startswith("text/plain"):

View file

@ -13,7 +13,7 @@ import asyncio
from urllib.parse import quote_plus from urllib.parse import quote_plus
from fastapi import FastAPI, Response, Request, UploadFile, Form, Depends from fastapi import FastAPI, Response, Request, UploadFile, Form, Depends
from fastapi.middleware.wsgi import WSGIMiddleware from fastapi.middleware.wsgi import WSGIMiddleware
from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse, FileResponse
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.security import APIKeyHeader from fastapi.security import APIKeyHeader
from starlette.exceptions import HTTPException from starlette.exceptions import HTTPException
@ -30,6 +30,7 @@ from fastapi.encoders import jsonable_encoder
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials, HTTPBasic from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials, HTTPBasic
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from starlette.responses import FileResponse from starlette.responses import FileResponse
from starlette.background import BackgroundTask
from types import SimpleNamespace from types import SimpleNamespace
from typing import Union, Optional, List from typing import Union, Optional, List
@ -49,6 +50,7 @@ from g4f.image.copy_images import get_media_dir, copy_media, get_source_url
from g4f.errors import ProviderNotFoundError, ModelNotFoundError, MissingAuthError, NoValidHarFileError from g4f.errors import ProviderNotFoundError, ModelNotFoundError, MissingAuthError, NoValidHarFileError
from g4f.cookies import read_cookie_files, get_cookies_dir from g4f.cookies import read_cookie_files, get_cookies_dir
from g4f.providers.types import ProviderType from g4f.providers.types import ProviderType
from g4f.providers.response import AudioResponse
from g4f.providers.any_provider import AnyProvider from g4f.providers.any_provider import AnyProvider
from g4f import Provider from g4f import Provider
from g4f.gui import get_gui_app from g4f.gui import get_gui_app
@ -58,7 +60,7 @@ from .stubs import (
ProviderResponseModel, ModelResponseModel, ProviderResponseModel, ModelResponseModel,
ErrorResponseModel, ProviderResponseDetailModel, ErrorResponseModel, ProviderResponseDetailModel,
FileResponseModel, UploadResponseModel, FileResponseModel, UploadResponseModel,
TranscriptionResponseModel TranscriptionResponseModel, AudioSpeechConfig
) )
from g4f import debug from g4f import debug
@ -492,10 +494,11 @@ class Api:
} }
@self.app.post("/v1/audio/transcriptions", responses=responses) @self.app.post("/v1/audio/transcriptions", responses=responses)
@self.app.post("/api/{path_provider}/audio/transcriptions", responses=responses) @self.app.post("/api/{path_provider}/audio/transcriptions", responses=responses)
async def generate_image( @self.app.post("/api/markitdown", responses=responses)
async def convert(
file: UploadFile, file: UploadFile,
model: Annotated[Optional[str], Form()] = None, model: Annotated[Optional[str], Form()] = None,
provider: Annotated[Optional[str], Form()] = AppConfig.media_provider, provider: Annotated[Optional[str], Form()] = "MarkItDown",
path_provider: str = None, path_provider: str = None,
prompt: Annotated[Optional[str], Form()] = "Transcribe this audio", prompt: Annotated[Optional[str], Form()] = "Transcribe this audio",
api_key: Annotated[Optional[str], Form()] = None, api_key: Annotated[Optional[str], Form()] = None,
@ -525,6 +528,54 @@ class Api:
logger.exception(e) logger.exception(e)
return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR) return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
responses = {
HTTP_200_OK: {"class": FileResponse},
HTTP_401_UNAUTHORIZED: {"model": ErrorResponseModel},
HTTP_404_NOT_FOUND: {"model": ErrorResponseModel},
HTTP_500_INTERNAL_SERVER_ERROR: {"model": ErrorResponseModel},
}
@self.app.post("/v1/audio/speech", responses=responses)
@self.app.post("/api/{path_provider}/audio/speech", responses=responses)
async def generate_speech(
    config: AudioSpeechConfig,
    provider: str = AppConfig.media_provider,
    path_provider: str = None,
    credentials: Annotated[HTTPAuthorizationCredentials, Depends(Api.security)] = None
):
    """Generate speech for ``config.input`` and return the audio file.

    The text is sent as a chat completion with the ``audio`` option set
    (voice and format taken from ``config``). When the completion content is
    an ``AudioResponse``, the referenced file is served and removed again by
    a background task after the response has been sent.

    Args:
        config: Request body describing the text, model, voice and format.
        provider: Provider name; defaults to ``AppConfig.media_provider``.
        path_provider: Provider taken from the ``/api/{path_provider}/...``
            route. When present it overrides ``provider``.
        credentials: Optional bearer credentials. The placeholder value
            ``"secret"`` is not forwarded as an API key.

    Returns:
        A ``FileResponse`` with the generated audio, or an ``ErrorResponse``
        with status 404 (unknown model/provider), 401 (missing auth) or
        500 (any other failure, including a reply without audio).
    """
    api_key = None
    if credentials is not None and credentials.credentials != "secret":
        api_key = credentials.credentials
    # The route declares {path_provider}; without binding it here the
    # provider named in the URL would be silently ignored.
    if path_provider is not None:
        provider = path_provider
    # Read the correctly spelled ``instructions`` attribute when the config
    # model exposes one; fall back to the legacy misspelled field otherwise.
    instructions = getattr(config, "instructions", None) or config.instrcutions
    try:
        response = await self.client.chat.completions.create(
            messages=[
                {"role": "user", "content": f"{instructions} Text: {config.input}"}
            ],
            model=config.model,
            provider=config.provider if provider is None else provider,
            prompt=config.input,
            audio=filter_none(voice=config.voice, format=config.response_format),
            **filter_none(
                api_key=api_key,
            )
        )
        content = response.choices[0].message.content
        if not isinstance(content, AudioResponse):
            # Previously this case fell through and returned ``None`` with a
            # 200 status; surface it as an error via the generic handler.
            raise RuntimeError("The provider did not return any audio data.")
        # Only the leading "/media" prefix maps to the media directory; limit
        # the replacement so a later "/media" inside the path is left intact.
        file_path = content.data.replace("/media", get_media_dir(), 1)

        def delete_file():
            # Best-effort cleanup: a failed delete must not break the response.
            try:
                os.remove(file_path)
            except Exception as e:
                logger.exception(e)

        return FileResponse(file_path, background=BackgroundTask(delete_file))
    except (ModelNotFoundError, ProviderNotFoundError) as e:
        logger.exception(e)
        return ErrorResponse.from_exception(e, None, HTTP_404_NOT_FOUND)
    except MissingAuthError as e:
        logger.exception(e)
        return ErrorResponse.from_exception(e, None, HTTP_401_UNAUTHORIZED)
    except Exception as e:
        logger.exception(e)
        return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
@self.app.post("/v1/upload_cookies", responses={ @self.app.post("/v1/upload_cookies", responses={
HTTP_200_OK: {"model": List[FileResponseModel]}, HTTP_200_OK: {"model": List[FileResponseModel]},
}) })

View file

@ -118,4 +118,12 @@ class FileResponseModel(BaseModel):
class TranscriptionResponseModel(BaseModel): class TranscriptionResponseModel(BaseModel):
text: str text: str
model: str model: str
provider: str provider: str
class AudioSpeechConfig(BaseModel):
    """Request body for the text-to-speech endpoints.

    Only ``input`` is required; every other field is optional.
    """
    # Text that should be spoken.
    input: str
    # Model and provider selection; ``None`` leaves the choice to the server.
    model: Optional[str] = None
    provider: Optional[str] = None
    # Voice name passed on as part of the audio options.
    voice: Optional[str] = None
    # Legacy, misspelled field name. Kept with its default because existing
    # code reads ``config.instrcutions``.
    instrcutions: str = "Speech this text in a natural way."
    # Correctly spelled field, so a request that sends ``instructions`` is
    # accepted instead of being dropped as an unknown key. ``None`` means the
    # client did not send it.
    instructions: Optional[str] = None
    # Requested audio format, passed on as part of the audio options.
    response_format: Optional[str] = None

View file

@ -45,6 +45,17 @@ def add_chunk(content, chunk):
content = str(content) + str(chunk) content = str(content) + str(chunk)
return content return content
def resolve_media(kwargs: dict, image = None, image_name: str = None) -> None:
    """Normalise media input in ``kwargs`` to a list of ``(data, name)`` pairs.

    The result is stored under ``kwargs["media"]``. Sources, in priority order:

    1. ``image`` (with ``image_name``) when it is not ``None``;
    2. a legacy ``"images"`` entry, which is moved to ``"media"``;
    3. an existing ``"media"`` entry.

    A single non-list value is wrapped in a list. Items that are not already
    a list/tuple are paired with their ``name`` attribute (``None`` if the
    item has none). A ``None`` media value is treated as "no media" and left
    as is. The caller's original list is never modified; a new list is stored.

    Args:
        kwargs: Keyword arguments to normalise; updated in place.
        image: Optional single media object that takes precedence.
        image_name: File name to associate with ``image``.
    """
    if image is not None:
        kwargs["media"] = [(image, image_name)]
    elif "images" in kwargs:
        kwargs["media"] = kwargs.pop("images")
    media = kwargs.get("media")
    if media is None:
        # Nothing to normalise; wrapping None would yield a bogus (None, None) item.
        return
    if not isinstance(media, list):
        media = [media]
    # Build a fresh list so the list object passed in by the caller stays untouched.
    kwargs["media"] = [
        item if isinstance(item, (list, tuple)) else (item, getattr(item, "name", None))
        for item in media
    ]
# Synchronous iter_response function # Synchronous iter_response function
def iter_response( def iter_response(
response: Union[Iterator[Union[str, ResponseType]]], response: Union[Iterator[Union[str, ResponseType]]],
@ -296,13 +307,7 @@ class Completions:
) -> ChatCompletion: ) -> ChatCompletion:
if isinstance(messages, str): if isinstance(messages, str):
messages = [{"role": "user", "content": messages}] messages = [{"role": "user", "content": messages}]
if image is not None: resolve_media(kwargs, image, image_name)
kwargs["media"] = [(image, image_name)]
elif "images" in kwargs:
kwargs["media"] = kwargs.pop("images")
for idx, media in enumerate(kwargs.get("media", [])):
if not isinstance(media, (list, tuple)):
kwargs["media"][idx] = (media, getattr(media, "name", None))
if provider is None: if provider is None:
provider = self.provider provider = self.provider
if provider is None: if provider is None:
@ -483,6 +488,7 @@ class Images:
async def async_create_variation( async def async_create_variation(
self, self,
image: ImageType, image: ImageType,
image_name: str = None,
model: Optional[str] = None, model: Optional[str] = None,
provider: Optional[ProviderType] = None, provider: Optional[ProviderType] = None,
response_format: Optional[str] = None, response_format: Optional[str] = None,
@ -494,11 +500,7 @@ class Images:
if proxy is None: if proxy is None:
proxy = self.client.proxy proxy = self.client.proxy
prompt = "create a variation of this image" prompt = "create a variation of this image"
if image is not None: resolve_media(kwargs, image, image_name)
kwargs["media"] = image
for idx, media in enumerate(kwargs.get("media", [])):
if not isinstance(media, (list, tuple)):
kwargs["media"][idx] = (media, getattr(media, "name", None))
error = None error = None
response = None response = None
if isinstance(provider_handler, IterListProvider): if isinstance(provider_handler, IterListProvider):
@ -600,13 +602,7 @@ class AsyncCompletions:
) -> Awaitable[ChatCompletion]: ) -> Awaitable[ChatCompletion]:
if isinstance(messages, str): if isinstance(messages, str):
messages = [{"role": "user", "content": messages}] messages = [{"role": "user", "content": messages}]
if image is not None: resolve_media(kwargs, image, image_name)
kwargs["media"] = [(image, image_name)]
elif "images" in kwargs:
kwargs["media"] = kwargs.pop("images")
for idx, media in enumerate(kwargs.get("media", [])):
if not isinstance(media, (list, tuple)):
kwargs["media"][idx] = (media, getattr(media, "name", None))
if provider is None: if provider is None:
provider = self.provider provider = self.provider
if provider is None: if provider is None:

View file

@ -11,7 +11,7 @@ from ..Provider.hf_space import HuggingSpace
from .. import Provider from .. import Provider
from .. import models from .. import models
from ..Provider import Cloudflare, LMArenaProvider, Gemini, Grok, DeepSeekAPI, PerplexityLabs, LambdaChat, PollinationsAI, FreeRouter from ..Provider import Cloudflare, LMArenaProvider, Gemini, Grok, DeepSeekAPI, PerplexityLabs, LambdaChat, PollinationsAI, FreeRouter
from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox, EdgeTTS, gTTS, MarkItDown
from .base_provider import AsyncGeneratorProvider, ProviderModelMixin from .base_provider import AsyncGeneratorProvider, ProviderModelMixin
class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin): class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):
@ -124,15 +124,17 @@ class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):
elif not model or model == cls.default_model: elif not model or model == cls.default_model:
model = "" model = ""
has_image = False has_image = False
has_audio = "audio" in kwargs has_audio = False
if not has_audio and media is not None: if not has_audio and media is not None:
for media_data, filename in media: for media_data, filename in media:
if is_data_an_audio(media_data, filename): if is_data_an_audio(media_data, filename):
has_audio = True has_audio = True
break break
has_image = True has_image = True
if has_audio: if "audio" in kwargs or "audio" in kwargs.get("modalities", []):
providers = [PollinationsAI, Microsoft_Phi_4] providers = [PollinationsAI, EdgeTTS, gTTS]
elif has_audio:
providers = [PollinationsAI, Microsoft_Phi_4, MarkItDown]
elif has_image: elif has_image:
providers = models.default_vision.best_provider.providers providers = models.default_vision.best_provider.providers
else: else: