Mirror of https://github.com/xtekky/gpt4free.git (synced 2025-12-06 02:30:41 -08:00)
feat: add audio speech generation endpoint and media handling refactor
- Added new `/v1/audio/speech` and `/api/{path_provider}/audio/speech` endpoints in `g4f/api/__init__.py` for generating speech from text (see the request sketch below the change summary)
- Introduced `AudioSpeechConfig` model in `g4f/api/stubs.py` with fields for input, model, provider, voice, instructions, and response format
- Updated `PollinationsAI.py` to support `modalities` in `kwargs` when checking for audio
- Set default voice for audio models in `PollinationsAI.py` if not provided in `kwargs`
- Added a debug print in `PollinationsAI.py` to log the request data sent to the text API endpoint
- Extended supported FastAPI response types in `g4f/api/__init__.py` to include `FileResponse` from `starlette.responses`
- Added `BackgroundTask` to clean up generated audio files after serving in `g4f/api/__init__.py`
- Modified `AnyProvider.py` to include `EdgeTTS`, `gTTS`, and `MarkItDown` as audio providers when `audio` is in `kwargs` or `modalities`
- Created `resolve_media` helper in `g4f/client/__init__.py` to standardize media handling for audio/image input
- Replaced manual media preprocessing in `Completions`, `AsyncCompletions`, and `Images` classes with `resolve_media`
- Added `/docs/README.md` with a link to the documentation site
This commit is contained in:
parent b15a83ae13
commit c3632984f7
6 changed files with 90 additions and 29 deletions
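For orientation, a minimal request sketch against the new speech endpoint. This is not part of the commit: the base URL (a local g4f API server, commonly http://localhost:1337) and the output filename are assumptions, and the JSON fields mirror the `AudioSpeechConfig` model added in `g4f/api/stubs.py`.

import requests  # plain HTTP client; any client works

payload = {
    "input": "Hello from gpt4free!",   # required: text to turn into speech
    "model": None,                      # optional model name
    "provider": "PollinationsAI",       # optional provider override
    "voice": None,                      # optional provider-specific voice
    "response_format": None,            # optional audio format hint
}

# Assumed server address; adjust to wherever the g4f API is running.
resp = requests.post("http://localhost:1337/v1/audio/speech", json=payload)
resp.raise_for_status()

# The endpoint returns the generated audio file directly (a FileResponse).
with open("speech_output.mp3", "wb") as f:  # extension is a guess; it depends on the provider
    f.write(resp.content)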
docs/README.md (new file, 1 addition)

@@ -0,0 +1 @@
+[Documentation](https://gpt4free.github.io/docs/main.html)
PollinationsAI.py

@@ -177,7 +177,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
         # Load model list
         cls.get_models()
         if not model:
-            has_audio = "audio" in kwargs
+            has_audio = "audio" in kwargs or "audio" in kwargs.get("modalities", [])
             if not has_audio and media is not None:
                 for media_data, filename in media:
                     if is_data_an_audio(media_data, filename):

@@ -311,6 +311,8 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):

         async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session:
             if model in cls.audio_models:
+                if "audio" in kwargs and kwargs.get("audio", {}).get("voice") is None:
+                    kwargs["audio"]["voice"] = cls.audio_models[model][0]
                 url = cls.text_api_endpoint
                 stream = False
             else:

@@ -329,6 +331,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
                 "cache": cache,
                 **extra_parameters
             })
+            print(f"Requesting {url} with data: {data}")
             async with session.post(url, json=data) as response:
                 await raise_for_status(response)
                 if response.headers["content-type"].startswith("text/plain"):
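To see the two behaviour changes above in isolation, here is a small standalone sketch, not the provider's actual code: the `audio_models` mapping and the `kwargs` shape are hypothetical stand-ins.

# Hypothetical stand-in for cls.audio_models: model name -> list of supported voices
audio_models = {"openai-audio": ["alloy", "echo", "nova"]}

def wants_audio(kwargs: dict) -> bool:
    # Audio is requested either via an explicit "audio" argument
    # or via "audio" listed under the "modalities" argument.
    return "audio" in kwargs or "audio" in kwargs.get("modalities", [])

def apply_default_voice(model: str, kwargs: dict) -> None:
    # Mirrors the new default-voice behaviour: if the caller passed an
    # "audio" dict without a voice, fall back to the model's first voice.
    if model in audio_models:
        if "audio" in kwargs and kwargs.get("audio", {}).get("voice") is None:
            kwargs["audio"]["voice"] = audio_models[model][0]

kwargs = {"modalities": ["text", "audio"], "audio": {"voice": None}}
print(wants_audio(kwargs))         # True
apply_default_voice("openai-audio", kwargs)
print(kwargs["audio"]["voice"])    # "alloy"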
g4f/api/__init__.py

@@ -13,7 +13,7 @@ import asyncio
 from urllib.parse import quote_plus
 from fastapi import FastAPI, Response, Request, UploadFile, Form, Depends
 from fastapi.middleware.wsgi import WSGIMiddleware
-from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse
+from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse, FileResponse
 from fastapi.exceptions import RequestValidationError
 from fastapi.security import APIKeyHeader
 from starlette.exceptions import HTTPException

@@ -30,6 +30,7 @@ from fastapi.encoders import jsonable_encoder
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials, HTTPBasic
 from fastapi.middleware.cors import CORSMiddleware
 from starlette.responses import FileResponse
+from starlette.background import BackgroundTask
 from types import SimpleNamespace
 from typing import Union, Optional, List

@@ -49,6 +50,7 @@ from g4f.image.copy_images import get_media_dir, copy_media, get_source_url
 from g4f.errors import ProviderNotFoundError, ModelNotFoundError, MissingAuthError, NoValidHarFileError
 from g4f.cookies import read_cookie_files, get_cookies_dir
 from g4f.providers.types import ProviderType
+from g4f.providers.response import AudioResponse
 from g4f.providers.any_provider import AnyProvider
 from g4f import Provider
 from g4f.gui import get_gui_app

@@ -58,7 +60,7 @@ from .stubs import (
     ProviderResponseModel, ModelResponseModel,
     ErrorResponseModel, ProviderResponseDetailModel,
     FileResponseModel, UploadResponseModel,
-    TranscriptionResponseModel
+    TranscriptionResponseModel, AudioSpeechConfig
 )
 from g4f import debug

@@ -492,10 +494,11 @@ class Api:
        }
        @self.app.post("/v1/audio/transcriptions", responses=responses)
        @self.app.post("/api/{path_provider}/audio/transcriptions", responses=responses)
-        async def generate_image(
+        @self.app.post("/api/markitdown", responses=responses)
+        async def convert(
            file: UploadFile,
            model: Annotated[Optional[str], Form()] = None,
-            provider: Annotated[Optional[str], Form()] = AppConfig.media_provider,
+            provider: Annotated[Optional[str], Form()] = "MarkItDown",
            path_provider: str = None,
            prompt: Annotated[Optional[str], Form()] = "Transcribe this audio",
            api_key: Annotated[Optional[str], Form()] = None,
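As an aside, the renamed `convert` handler above is a multipart form endpoint. A hedged usage sketch follows; the server address and the sample file are assumptions, and the form field names come from the handler signature (the response body shape is not shown in this hunk).

import requests

# Assumed local server address; the /api/markitdown route comes from the new decorator above.
url = "http://localhost:1337/api/markitdown"
with open("notes.pdf", "rb") as f:              # any document you have at hand
    resp = requests.post(
        url,
        files={"file": ("notes.pdf", f)},       # matches the UploadFile parameter
        data={"provider": "MarkItDown"},        # matches the new default provider
    )
resp.raise_for_status()
print(resp.json())                               # presumably text/model/provider fields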
@@ -525,6 +528,54 @@ class Api:
                logger.exception(e)
                return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
+
+        responses = {
+            HTTP_200_OK: {"class": FileResponse},
+            HTTP_401_UNAUTHORIZED: {"model": ErrorResponseModel},
+            HTTP_404_NOT_FOUND: {"model": ErrorResponseModel},
+            HTTP_500_INTERNAL_SERVER_ERROR: {"model": ErrorResponseModel},
+        }
+        @self.app.post("/v1/audio/speech", responses=responses)
+        @self.app.post("/api/{path_provider}/audio/speech", responses=responses)
+        async def generate_speech(
+            config: AudioSpeechConfig,
+            provider: str = AppConfig.media_provider,
+            credentials: Annotated[HTTPAuthorizationCredentials, Depends(Api.security)] = None
+        ):
+            api_key = None
+            if credentials is not None and credentials.credentials != "secret":
+                api_key = credentials.credentials
+            try:
+                response = await self.client.chat.completions.create(
+                    messages=[
+                        {"role": "user", "content": f"{config.instructions} Text: {config.input}"}
+                    ],
+                    model=config.model,
+                    provider=config.provider if provider is None else provider,
+                    prompt=config.input,
+                    audio=filter_none(voice=config.voice, format=config.response_format),
+                    **filter_none(
+                        api_key=api_key,
+                    )
+                )
+                if isinstance(response.choices[0].message.content, AudioResponse):
+                    response = response.choices[0].message.content.data
+                    response = response.replace("/media", get_media_dir())
+                    def delete_file():
+                        try:
+                            os.remove(response)
+                        except Exception as e:
+                            logger.exception(e)
+                    return FileResponse(response, background=BackgroundTask(delete_file))
+            except (ModelNotFoundError, ProviderNotFoundError) as e:
+                logger.exception(e)
+                return ErrorResponse.from_exception(e, None, HTTP_404_NOT_FOUND)
+            except MissingAuthError as e:
+                logger.exception(e)
+                return ErrorResponse.from_exception(e, None, HTTP_401_UNAUTHORIZED)
+            except Exception as e:
+                logger.exception(e)
+                return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR)
+
        @self.app.post("/v1/upload_cookies", responses={
            HTTP_200_OK: {"model": List[FileResponseModel]},
        })
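The file-cleanup pattern used by `generate_speech` above, shown on its own: serve a file and remove it once the response has been sent. This is a generic FastAPI/Starlette sketch rather than g4f code; the route and filenames are made up.

import os
import tempfile

from fastapi import FastAPI
from fastapi.responses import FileResponse
from starlette.background import BackgroundTask

app = FastAPI()

@app.get("/demo/audio")
def demo_audio():
    # Create a temporary file standing in for a generated audio file.
    fd, path = tempfile.mkstemp(suffix=".mp3")
    with os.fdopen(fd, "wb") as f:
        f.write(b"fake audio bytes")

    def delete_file():
        # Runs after the response body has been sent to the client.
        try:
            os.remove(path)
        except OSError:
            pass

    # BackgroundTask defers the cleanup until the file has been served.
    return FileResponse(path, background=BackgroundTask(delete_file))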
g4f/api/stubs.py

@@ -118,4 +118,12 @@ class FileResponseModel(BaseModel):
 class TranscriptionResponseModel(BaseModel):
     text: str
     model: str
     provider: str
+
+class AudioSpeechConfig(BaseModel):
+    input: str
+    model: Optional[str] = None
+    provider: Optional[str] = None
+    voice: Optional[str] = None
+    instructions: str = "Speak this text in a natural way."
+    response_format: Optional[str] = None
g4f/client/__init__.py

@@ -45,6 +45,17 @@ def add_chunk(content, chunk):
         content = str(content) + str(chunk)
     return content
+
+def resolve_media(kwargs: dict, image = None, image_name: str = None) -> None:
+    if image is not None:
+        kwargs["media"] = [(image, image_name)]
+    elif "images" in kwargs:
+        kwargs["media"] = kwargs.pop("images")
+    if "media" in kwargs and not isinstance(kwargs["media"], list):
+        kwargs["media"] = [kwargs["media"]]
+    for idx, media in enumerate(kwargs.get("media", [])):
+        if not isinstance(media, (list, tuple)):
+            kwargs["media"][idx] = (media, getattr(media, "name", None))

 # Synchronous iter_response function
 def iter_response(
     response: Union[Iterator[Union[str, ResponseType]]],
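A quick illustration of what the new `resolve_media` helper normalizes, assuming it can be imported from `g4f.client` since it is added at module level above.

from g4f.client import resolve_media  # assumes the helper shown above is importable

# Single image plus name -> wrapped as a one-element list of (data, name) tuples
kwargs = {}
resolve_media(kwargs, image="cat.jpg", image_name="cat.jpg")
print(kwargs["media"])      # [('cat.jpg', 'cat.jpg')]

# Legacy "images" keyword -> renamed to "media"
kwargs = {"images": [("dog.jpg", "dog.jpg")]}
resolve_media(kwargs)
print(kwargs["media"])      # [('dog.jpg', 'dog.jpg')]

# A bare value (not a list) -> wrapped in a list; the name falls back to a .name attribute or None
kwargs = {"media": "sound.mp3"}
resolve_media(kwargs)
print(kwargs["media"])      # [('sound.mp3', None)]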
@@ -296,13 +307,7 @@ class Completions:
     ) -> ChatCompletion:
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-        if image is not None:
-            kwargs["media"] = [(image, image_name)]
-        elif "images" in kwargs:
-            kwargs["media"] = kwargs.pop("images")
-        for idx, media in enumerate(kwargs.get("media", [])):
-            if not isinstance(media, (list, tuple)):
-                kwargs["media"][idx] = (media, getattr(media, "name", None))
+        resolve_media(kwargs, image, image_name)
         if provider is None:
             provider = self.provider
         if provider is None:
@@ -483,6 +488,7 @@ class Images:
     async def async_create_variation(
         self,
         image: ImageType,
+        image_name: str = None,
         model: Optional[str] = None,
         provider: Optional[ProviderType] = None,
         response_format: Optional[str] = None,

@@ -494,11 +500,7 @@ class Images:
         if proxy is None:
             proxy = self.client.proxy
         prompt = "create a variation of this image"
-        if image is not None:
-            kwargs["media"] = image
-        for idx, media in enumerate(kwargs.get("media", [])):
-            if not isinstance(media, (list, tuple)):
-                kwargs["media"][idx] = (media, getattr(media, "name", None))
+        resolve_media(kwargs, image, image_name)
         error = None
         response = None
         if isinstance(provider_handler, IterListProvider):
@@ -600,13 +602,7 @@ class AsyncCompletions:
     ) -> Awaitable[ChatCompletion]:
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-        if image is not None:
-            kwargs["media"] = [(image, image_name)]
-        elif "images" in kwargs:
-            kwargs["media"] = kwargs.pop("images")
-        for idx, media in enumerate(kwargs.get("media", [])):
-            if not isinstance(media, (list, tuple)):
-                kwargs["media"][idx] = (media, getattr(media, "name", None))
+        resolve_media(kwargs, image, image_name)
         if provider is None:
             provider = self.provider
         if provider is None:
AnyProvider.py

@@ -11,7 +11,7 @@ from ..Provider.hf_space import HuggingSpace
 from .. import Provider
 from .. import models
 from ..Provider import Cloudflare, LMArenaProvider, Gemini, Grok, DeepSeekAPI, PerplexityLabs, LambdaChat, PollinationsAI, FreeRouter
-from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox
+from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox, EdgeTTS, gTTS, MarkItDown
 from .base_provider import AsyncGeneratorProvider, ProviderModelMixin

 class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):

@@ -124,15 +124,17 @@ class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin):
         elif not model or model == cls.default_model:
             model = ""
         has_image = False
-        has_audio = "audio" in kwargs
+        has_audio = False
         if not has_audio and media is not None:
             for media_data, filename in media:
                 if is_data_an_audio(media_data, filename):
                     has_audio = True
                     break
                 has_image = True
-        if has_audio:
-            providers = [PollinationsAI, Microsoft_Phi_4]
+        if "audio" in kwargs or "audio" in kwargs.get("modalities", []):
+            providers = [PollinationsAI, EdgeTTS, gTTS]
+        elif has_audio:
+            providers = [PollinationsAI, Microsoft_Phi_4, MarkItDown]
         elif has_image:
             providers = models.default_vision.best_provider.providers
         else:
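To restate the new branching in plain form, a standalone sketch with placeholder strings instead of the imported provider classes; the helper name pick_audio_providers is made up for illustration.

# Placeholder names standing in for the provider classes imported above.
PollinationsAI, EdgeTTS, gTTS = "PollinationsAI", "EdgeTTS", "gTTS"
Microsoft_Phi_4, MarkItDown = "Microsoft_Phi_4", "MarkItDown"

def pick_audio_providers(kwargs: dict, has_audio_input: bool) -> list:
    # Audio *output* requested (speech generation): prefer text-to-speech providers.
    if "audio" in kwargs or "audio" in kwargs.get("modalities", []):
        return [PollinationsAI, EdgeTTS, gTTS]
    # Audio *input* detected in the media (transcription-style handling).
    if has_audio_input:
        return [PollinationsAI, Microsoft_Phi_4, MarkItDown]
    return []

print(pick_audio_providers({"modalities": ["audio"]}, has_audio_input=False))
# ['PollinationsAI', 'EdgeTTS', 'gTTS']
print(pick_audio_providers({}, has_audio_input=True))
# ['PollinationsAI', 'Microsoft_Phi_4', 'MarkItDown']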