diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..6a9656e7 --- /dev/null +++ b/docs/README.md @@ -0,0 +1 @@ +[Documentation](https://gpt4free.github.io/docs/main.html) \ No newline at end of file diff --git a/g4f/Provider/PollinationsAI.py b/g4f/Provider/PollinationsAI.py index 0584b54d..8441a12a 100644 --- a/g4f/Provider/PollinationsAI.py +++ b/g4f/Provider/PollinationsAI.py @@ -177,7 +177,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin): # Load model list cls.get_models() if not model: - has_audio = "audio" in kwargs + has_audio = "audio" in kwargs or "audio" in kwargs.get("modalities", []) if not has_audio and media is not None: for media_data, filename in media: if is_data_an_audio(media_data, filename): @@ -311,6 +311,8 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin): async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session: if model in cls.audio_models: + if "audio" in kwargs and kwargs.get("audio", {}).get("voice") is None: + kwargs["audio"]["voice"] = cls.audio_models[model][0] url = cls.text_api_endpoint stream = False else: @@ -329,6 +331,7 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin): "cache": cache, **extra_parameters }) + print(f"Requesting {url} with data: {data}") async with session.post(url, json=data) as response: await raise_for_status(response) if response.headers["content-type"].startswith("text/plain"): diff --git a/g4f/api/__init__.py b/g4f/api/__init__.py index 4ef59e92..0f9de28d 100644 --- a/g4f/api/__init__.py +++ b/g4f/api/__init__.py @@ -13,7 +13,7 @@ import asyncio from urllib.parse import quote_plus from fastapi import FastAPI, Response, Request, UploadFile, Form, Depends from fastapi.middleware.wsgi import WSGIMiddleware -from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse +from fastapi.responses import StreamingResponse, RedirectResponse, HTMLResponse, JSONResponse, FileResponse from fastapi.exceptions import RequestValidationError from fastapi.security import APIKeyHeader from starlette.exceptions import HTTPException @@ -30,6 +30,7 @@ from fastapi.encoders import jsonable_encoder from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials, HTTPBasic from fastapi.middleware.cors import CORSMiddleware from starlette.responses import FileResponse +from starlette.background import BackgroundTask from types import SimpleNamespace from typing import Union, Optional, List @@ -49,6 +50,7 @@ from g4f.image.copy_images import get_media_dir, copy_media, get_source_url from g4f.errors import ProviderNotFoundError, ModelNotFoundError, MissingAuthError, NoValidHarFileError from g4f.cookies import read_cookie_files, get_cookies_dir from g4f.providers.types import ProviderType +from g4f.providers.response import AudioResponse from g4f.providers.any_provider import AnyProvider from g4f import Provider from g4f.gui import get_gui_app @@ -58,7 +60,7 @@ from .stubs import ( ProviderResponseModel, ModelResponseModel, ErrorResponseModel, ProviderResponseDetailModel, FileResponseModel, UploadResponseModel, - TranscriptionResponseModel + TranscriptionResponseModel, AudioSpeechConfig ) from g4f import debug @@ -492,10 +494,11 @@ class Api: } @self.app.post("/v1/audio/transcriptions", responses=responses) @self.app.post("/api/{path_provider}/audio/transcriptions", responses=responses) - async def generate_image( + @self.app.post("/api/markitdown", responses=responses) + async def convert( file: UploadFile, model: Annotated[Optional[str], Form()] = None, - provider: Annotated[Optional[str], Form()] = AppConfig.media_provider, + provider: Annotated[Optional[str], Form()] = "MarkItDown", path_provider: str = None, prompt: Annotated[Optional[str], Form()] = "Transcribe this audio", api_key: Annotated[Optional[str], Form()] = None, @@ -525,6 +528,54 @@ class Api: logger.exception(e) return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR) + responses = { + HTTP_200_OK: {"class": FileResponse}, + HTTP_401_UNAUTHORIZED: {"model": ErrorResponseModel}, + HTTP_404_NOT_FOUND: {"model": ErrorResponseModel}, + HTTP_500_INTERNAL_SERVER_ERROR: {"model": ErrorResponseModel}, + } + @self.app.post("/v1/audio/speech", responses=responses) + @self.app.post("/api/{path_provider}/audio/speech", responses=responses) + async def generate_speech( + config: AudioSpeechConfig, + provider: str = AppConfig.media_provider, + credentials: Annotated[HTTPAuthorizationCredentials, Depends(Api.security)] = None + ): + api_key = None + if credentials is not None and credentials.credentials != "secret": + api_key = credentials.credentials + try: + response = await self.client.chat.completions.create( + messages=[ + {"role": "user", "content": f"{config.instrcutions} Text: {config.input}"} + ], + model=config.model, + provider=config.provider if provider is None else provider, + prompt=config.input, + audio=filter_none(voice=config.voice, format=config.response_format), + **filter_none( + api_key=api_key, + ) + ) + if isinstance(response.choices[0].message.content, AudioResponse): + response = response.choices[0].message.content.data + response = response.replace("/media", get_media_dir()) + def delete_file(): + try: + os.remove(response) + except Exception as e: + logger.exception(e) + return FileResponse(response, background=BackgroundTask(delete_file)) + except (ModelNotFoundError, ProviderNotFoundError) as e: + logger.exception(e) + return ErrorResponse.from_exception(e, None, HTTP_404_NOT_FOUND) + except MissingAuthError as e: + logger.exception(e) + return ErrorResponse.from_exception(e, None, HTTP_401_UNAUTHORIZED) + except Exception as e: + logger.exception(e) + return ErrorResponse.from_exception(e, None, HTTP_500_INTERNAL_SERVER_ERROR) + @self.app.post("/v1/upload_cookies", responses={ HTTP_200_OK: {"model": List[FileResponseModel]}, }) diff --git a/g4f/api/stubs.py b/g4f/api/stubs.py index 25dfe1e9..914361bc 100644 --- a/g4f/api/stubs.py +++ b/g4f/api/stubs.py @@ -118,4 +118,12 @@ class FileResponseModel(BaseModel): class TranscriptionResponseModel(BaseModel): text: str model: str - provider: str \ No newline at end of file + provider: str + +class AudioSpeechConfig(BaseModel): + input: str + model: Optional[str] = None + provider: Optional[str] = None + voice: Optional[str] = None + instrcutions: str = "Speech this text in a natural way." + response_format: Optional[str] = None \ No newline at end of file diff --git a/g4f/client/__init__.py b/g4f/client/__init__.py index 6152af1c..3165b160 100644 --- a/g4f/client/__init__.py +++ b/g4f/client/__init__.py @@ -45,6 +45,17 @@ def add_chunk(content, chunk): content = str(content) + str(chunk) return content +def resolve_media(kwargs: dict, image = None, image_name: str = None) -> None: + if image is not None: + kwargs["media"] = [(image, image_name)] + elif "images" in kwargs: + kwargs["media"] = kwargs.pop("images") + if "media" in kwargs and not isinstance(kwargs["media"], list): + kwargs["media"] = [kwargs["media"]] + for idx, media in enumerate(kwargs.get("media", [])): + if not isinstance(media, (list, tuple)): + kwargs["media"][idx] = (media, getattr(media, "name", None)) + # Synchronous iter_response function def iter_response( response: Union[Iterator[Union[str, ResponseType]]], @@ -296,13 +307,7 @@ class Completions: ) -> ChatCompletion: if isinstance(messages, str): messages = [{"role": "user", "content": messages}] - if image is not None: - kwargs["media"] = [(image, image_name)] - elif "images" in kwargs: - kwargs["media"] = kwargs.pop("images") - for idx, media in enumerate(kwargs.get("media", [])): - if not isinstance(media, (list, tuple)): - kwargs["media"][idx] = (media, getattr(media, "name", None)) + resolve_media(kwargs, image, image_name) if provider is None: provider = self.provider if provider is None: @@ -483,6 +488,7 @@ class Images: async def async_create_variation( self, image: ImageType, + image_name: str = None, model: Optional[str] = None, provider: Optional[ProviderType] = None, response_format: Optional[str] = None, @@ -494,11 +500,7 @@ class Images: if proxy is None: proxy = self.client.proxy prompt = "create a variation of this image" - if image is not None: - kwargs["media"] = image - for idx, media in enumerate(kwargs.get("media", [])): - if not isinstance(media, (list, tuple)): - kwargs["media"][idx] = (media, getattr(media, "name", None)) + resolve_media(kwargs, image, image_name) error = None response = None if isinstance(provider_handler, IterListProvider): @@ -600,13 +602,7 @@ class AsyncCompletions: ) -> Awaitable[ChatCompletion]: if isinstance(messages, str): messages = [{"role": "user", "content": messages}] - if image is not None: - kwargs["media"] = [(image, image_name)] - elif "images" in kwargs: - kwargs["media"] = kwargs.pop("images") - for idx, media in enumerate(kwargs.get("media", [])): - if not isinstance(media, (list, tuple)): - kwargs["media"][idx] = (media, getattr(media, "name", None)) + resolve_media(kwargs, image, image_name) if provider is None: provider = self.provider if provider is None: diff --git a/g4f/providers/any_provider.py b/g4f/providers/any_provider.py index ddaf73ce..a4845f3b 100644 --- a/g4f/providers/any_provider.py +++ b/g4f/providers/any_provider.py @@ -11,7 +11,7 @@ from ..Provider.hf_space import HuggingSpace from .. import Provider from .. import models from ..Provider import Cloudflare, LMArenaProvider, Gemini, Grok, DeepSeekAPI, PerplexityLabs, LambdaChat, PollinationsAI, FreeRouter -from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox +from ..Provider import Microsoft_Phi_4, DeepInfraChat, Blackbox, EdgeTTS, gTTS, MarkItDown from .base_provider import AsyncGeneratorProvider, ProviderModelMixin class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin): @@ -124,15 +124,17 @@ class AnyProvider(AsyncGeneratorProvider, ProviderModelMixin): elif not model or model == cls.default_model: model = "" has_image = False - has_audio = "audio" in kwargs + has_audio = False if not has_audio and media is not None: for media_data, filename in media: if is_data_an_audio(media_data, filename): has_audio = True break has_image = True - if has_audio: - providers = [PollinationsAI, Microsoft_Phi_4] + if "audio" in kwargs or "audio" in kwargs.get("modalities", []): + providers = [PollinationsAI, EdgeTTS, gTTS] + elif has_audio: + providers = [PollinationsAI, Microsoft_Phi_4, MarkItDown] elif has_image: providers = models.default_vision.best_provider.providers else: