Add MarkItDownTool and TextToAudioTool; enhance scrape_text and fetch_and_scrape functions with metadata support

Author: hlohaus
Date:   2025-11-02 04:32:32 +01:00
Commit: 5d53e58d2c
Parent: 8df4bc7118
4 changed files with 201 additions and 11 deletions

View file

@@ -8,6 +8,6 @@ through the Model Context Protocol standard, allowing AI assistants to access:
 """
 from .server import MCPServer
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
-__all__ = ['MCPServer', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
+__all__ = ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']

View file

@@ -20,6 +20,7 @@ from ..debug import enable_logging
 enable_logging()
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
@@ -54,6 +55,8 @@ class MCPServer:
             'web_search': WebSearchTool(),
             'web_scrape': WebScrapeTool(),
             'image_generation': ImageGenerationTool(),
+            'text_to_audio': TextToAudioTool(),
+            'mark_it_down': MarkItDownTool()
         }
         self.server_info = {
             "name": "gpt4free-mcp-server",

View file

@@ -8,7 +8,6 @@ This module provides MCP tool implementations that wrap gpt4free capabilities:
 from __future__ import annotations
-import asyncio
 from typing import Any, Dict
 from abc import ABC, abstractmethod
@@ -158,7 +157,7 @@ class WebScrapeTool(MCPTool):
                 session=session,
                 url=url,
                 max_words=max_words,
-                add_source=True
+                add_metadata=True
             )
             if not content:
@@ -183,7 +182,7 @@ class ImageGenerationTool(MCPTool):
     @property
     def description(self) -> str:
-        return "Generate images from text prompts using AI image generation providers. Returns base64-encoded image data."
+        return "Generate images from text prompts using AI image generation providers. Returns a URL to the generated image."
     @property
     def input_schema(self) -> Dict[str, Any]:
@@ -291,3 +290,163 @@ class ImageGenerationTool(MCPTool):
             return {
                 "error": f"Image generation failed: {str(e)}"
             }
+
+class MarkItDownTool(MCPTool):
+    """MarkItDown tool for converting URLs to markdown format"""
+
+    @property
+    def description(self) -> str:
+        return "Convert a URL to markdown format using MarkItDown. Supports HTTP/HTTPS URLs and returns formatted markdown content."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to convert to markdown format (must be HTTP/HTTPS)"
+                },
+                "max_content_length": {
+                    "type": "integer",
+                    "description": "Maximum content length for processing (default: 10000)",
+                    "default": 10000
+                }
+            },
+            "required": ["url"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute MarkItDown conversion
+
+        Returns:
+            Dict[str, Any]: Markdown content or error message
+        """
+        try:
+            from ..integration.markitdown import MarkItDown
+        except ImportError as e:
+            return {
+                "error": f"MarkItDown is not installed: {str(e)}"
+            }
+
+        url = arguments.get("url", "")
+        max_content_length = arguments.get("max_content_length", 10000)
+
+        if not url:
+            return {
+                "error": "URL parameter is required"
+            }
+
+        # Validate URL format
+        if not url.startswith(("http://", "https://")):
+            return {
+                "error": "URL must start with http:// or https://"
+            }
+
+        try:
+            # Initialize MarkItDown
+            md = MarkItDown()
+
+            # Convert URL to markdown
+            result = md.convert_url(url)
+
+            if not result:
+                return {
+                    "error": "Failed to convert URL to markdown"
+                }
+
+            # Truncate if content exceeds max length
+            if len(result) > max_content_length:
+                result = result[:max_content_length] + "\n\n[Content truncated...]"
+
+            return {
+                "url": url,
+                "markdown_content": result,
+                "content_length": len(result),
+                "truncated": len(result) > max_content_length
+            }
+        except Exception as e:
+            return {
+                "error": f"MarkItDown conversion failed: {str(e)}"
+            }
+
+class TextToAudioTool(MCPTool):
+    """TextToAudio tool for generating audio from text prompts using Pollinations AI"""
+
+    @property
+    def description(self) -> str:
+        return "Generate an audio URL from a text prompt using Pollinations AI text-to-speech service. Returns a direct URL to the generated audio file."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "The text prompt to the audio model (example: 'Read this: Hello, world!')"
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Voice option for text-to-speech (default: 'alloy')",
+                    "default": "alloy"
+                },
+                "url_encode": {
+                    "type": "boolean",
+                    "description": "Whether to URL-encode the prompt text (default: True)",
+                    "default": True
+                }
+            },
+            "required": ["prompt"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute text-to-speech conversion
+
+        Returns:
+            Dict[str, Any]: Audio URL or error message
+        """
+        try:
+            import urllib.parse
+        except ImportError as e:
+            return {
+                "error": f"urllib is not available: {str(e)}"
+            }
+
+        prompt = arguments.get("prompt", "")
+        voice = arguments.get("voice", "alloy")
+        url_encode = arguments.get("url_encode", True)
+
+        if not prompt:
+            return {
+                "error": "Prompt parameter is required"
+            }
+
+        # Validate prompt length (reasonable limit for text-to-speech)
+        if len(prompt) > 5000:
+            return {
+                "error": "Prompt is too long (max 5000 characters)"
+            }
+
+        try:
+            # Prepare the prompt for the URL
+            if url_encode:
+                encoded_prompt = urllib.parse.quote(prompt)
+            else:
+                encoded_prompt = prompt.replace(" ", "%20")  # Basic space encoding
+
+            # Construct the Pollinations AI text-to-speech URL
+            base_url = "https://text.pollinations.ai"
+            audio_url = f"{base_url}/{encoded_prompt}?voice={voice}"
+
+            return {
+                "prompt": prompt,
+                "voice": voice,
+                "audio_url": audio_url
+            }
+        except Exception as e:
+            return {
+                "error": f"Text-to-speech URL generation failed: {str(e)}"
+            }
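
For a quick local check, both new tools can also be driven directly through the async execute() interface shown above. A minimal sketch, assuming the package is importable as g4f.mcp.tools (MarkItDown conversion additionally needs the optional markitdown integration installed):

# Sketch: calling the new tools directly; the g4f.mcp.tools import path is assumed from the package layout.
import asyncio
from g4f.mcp.tools import MarkItDownTool, TextToAudioTool

async def main():
    # Convert a page to markdown; an "error" key is returned if markitdown is unavailable.
    md = await MarkItDownTool().execute({"url": "https://example.com", "max_content_length": 2000})
    print(md.get("markdown_content", md.get("error")))

    # Build a Pollinations text-to-speech URL for a short prompt.
    tts = await TextToAudioTool().execute({"prompt": "Read this: Hello, world!", "voice": "alloy"})
    print(tts.get("audio_url", tts.get("error")))

asyncio.run(main())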

View file

@@ -3,14 +3,14 @@ from __future__ import annotations
 import hashlib
 import asyncio
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Dict, Iterator, Optional
 from urllib.parse import urlparse, quote_plus
 from aiohttp import ClientSession, ClientError
 from datetime import date
 import asyncio
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, Tag
     has_requirements = True
 except ImportError:
     has_requirements = False
@@ -18,11 +18,39 @@ except ImportError:
 from ..cookies import get_cookies_dir
 from ..providers.response import format_link
-def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
+def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2, add_metadata: bool = False) -> Iterator[str]:
     """
     Parses the provided HTML and yields text fragments.
     """
     soup = BeautifulSoup(html, "html.parser")
+
+    # Read the meta tags
+    if add_metadata:
+        metadata: Dict[str, str] = {}
+        if soup.title and soup.title.string:
+            yield f"## {soup.title.string}\n"
+            seen_texts.append(soup.title.string)
+            max_words = None if max_words is None else max_words - len(soup.title.string.split())
+        for meta in soup(["meta"]):
+            if not isinstance(meta, Tag):
+                continue
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
+                    break
+        description = metadata.get('description', metadata.get('og:description', '')).strip()
+        if description:
+            yield f"### Description\n{description}\n"
+            seen_texts.append(description)
+            max_words = None if max_words is None else max_words - len(description.split())
+
     for selector in [
         "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
         ".content-wrapper", "#content", "#mainContent",
@@ -74,14 +102,14 @@ def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = T
             domain = urlparse(link).netloc
             yield f"\nSource: [{domain}]({link})"
-async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
+async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, add_metadata: bool = False, proxy: str = None) -> str:
     """
     Fetches a URL and returns the scraped text, using caching to avoid redundant downloads.
     """
     try:
         cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         cache_dir.mkdir(parents=True, exist_ok=True)
-        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
+        md5_hash = hashlib.md5(url.encode(errors="ignore")+str([max_words, add_source, add_metadata]).encode(errors="ignore")).hexdigest()
         cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
         if cache_file.exists():
             return cache_file.read_text()
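
Note that the cache key now folds the scraping options into the MD5 digest, so the same URL fetched with and without metadata no longer collides on one cache file. A small sketch of the effect (the helper name is illustrative only):

# Mirrors the digest above: URL bytes plus the stringified option list.
import hashlib

def cache_key(url: str, max_words=None, add_source=False, add_metadata=False) -> str:
    data = url.encode(errors="ignore") + str([max_words, add_source, add_metadata]).encode(errors="ignore")
    return hashlib.md5(data).hexdigest()[:16]

print(cache_key("https://example.com"))                     # options at their defaults
print(cache_key("https://example.com", add_metadata=True))  # different digest, different cache file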
@@ -89,7 +117,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional
         async with session.get(url, proxy=proxy) as response:
             if response.status == 200:
                 html = await response.text(errors="replace")
-                scraped_text = "".join(scrape_text(html, max_words, add_source))
+                scraped_text = "".join(scrape_text(html, max_words, add_source, add_metadata=add_metadata))
                 with open(cache_file, "wb") as f:
                     f.write(scraped_text.encode(errors="replace"))
                 return scraped_text