Mirror of https://github.com/xtekky/gpt4free.git (synced 2025-12-05 18:20:35 -08:00)
Add MarkItDownTool and TextToAudioTool; enhance scrape_text and fetch_and_scrape functions with metadata support
This commit is contained in:
parent 8df4bc7118
commit 5d53e58d2c

4 changed files with 201 additions and 11 deletions
@@ -8,6 +8,6 @@ through the Model Context Protocol standard, allowing AI assistants to access:
 """
 
 from .server import MCPServer
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 
-__all__ = ['MCPServer', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
+__all__ = ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
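A minimal sketch of using the newly exported names, assuming they are importable from g4f.mcp as the __all__ list above suggests (the file name is not shown on this page); everything else here is illustrative:

    # Hypothetical usage; assumes g4f is installed and g4f.mcp re-exports these classes.
    from g4f.mcp import MarkItDownTool, TextToAudioTool

    for tool in (MarkItDownTool(), TextToAudioTool()):
        # Each tool exposes a description and a JSON-Schema style input_schema
        # (see the g4f/mcp/tools.py hunks later in this commit).
        print(tool.description)
        print(tool.input_schema["required"])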
@@ -20,6 +20,7 @@ from ..debug import enable_logging
 enable_logging()
 
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 
@@ -54,6 +55,8 @@ class MCPServer:
             'web_search': WebSearchTool(),
             'web_scrape': WebScrapeTool(),
             'image_generation': ImageGenerationTool(),
+            'text_to_audio': TextToAudioTool(),
+            'mark_it_down': MarkItDownTool()
         }
         self.server_info = {
             "name": "gpt4free-mcp-server",
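The registry above implies how requests reach the new tools. A minimal dispatch sketch, assuming a handler that receives an MCP tools/call request; the handler name and request shape are illustrative, not taken from the repository:

    # Illustrative dispatch; 'handle_tool_call' is an assumed name, not the actual server method.
    async def handle_tool_call(self, name: str, arguments: dict) -> dict:
        tool = self.tools.get(name)  # e.g. 'mark_it_down' or 'text_to_audio'
        if tool is None:
            return {"error": f"Unknown tool: {name}"}
        return await tool.execute(arguments)  # every registered tool implements async execute()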
g4f/mcp/tools.py (165 lines changed)
@@ -8,7 +8,6 @@ This module provides MCP tool implementations that wrap gpt4free capabilities:
 from __future__ import annotations
 
-import asyncio
 from typing import Any, Dict
 from abc import ABC, abstractmethod
 
@@ -158,7 +157,7 @@ class WebScrapeTool(MCPTool):
                 session=session,
                 url=url,
                 max_words=max_words,
-                add_source=True
+                add_metadata=True
             )
 
             if not content:
@@ -183,7 +182,7 @@ class ImageGenerationTool(MCPTool):
 
     @property
     def description(self) -> str:
-        return "Generate images from text prompts using AI image generation providers. Returns base64-encoded image data."
+        return "Generate images from text prompts using AI image generation providers. Returns a URL to the generated image."
 
     @property
     def input_schema(self) -> Dict[str, Any]:
@@ -291,3 +290,163 @@
             return {
                 "error": f"Image generation failed: {str(e)}"
             }
+
+
+class MarkItDownTool(MCPTool):
+    """MarkItDown tool for converting URLs to markdown format"""
+
+    @property
+    def description(self) -> str:
+        return "Convert a URL to markdown format using MarkItDown. Supports HTTP/HTTPS URLs and returns formatted markdown content."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to convert to markdown format (must be HTTP/HTTPS)"
+                },
+                "max_content_length": {
+                    "type": "integer",
+                    "description": "Maximum content length for processing (default: 10000)",
+                    "default": 10000
+                }
+            },
+            "required": ["url"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute MarkItDown conversion
+
+        Returns:
+            Dict[str, Any]: Markdown content or error message
+        """
+        try:
+            from ..integration.markitdown import MarkItDown
+        except ImportError as e:
+            return {
+                "error": f"MarkItDown is not installed: {str(e)}"
+            }
+
+        url = arguments.get("url", "")
+        max_content_length = arguments.get("max_content_length", 10000)
+
+        if not url:
+            return {
+                "error": "URL parameter is required"
+            }
+
+        # Validate URL format
+        if not url.startswith(("http://", "https://")):
+            return {
+                "error": "URL must start with http:// or https://"
+            }
+
+        try:
+            # Initialize MarkItDown
+            md = MarkItDown()
+
+            # Convert URL to markdown
+            result = md.convert_url(url)
+
+            if not result:
+                return {
+                    "error": "Failed to convert URL to markdown"
+                }
+
+            # Truncate if content exceeds max length
+            if len(result) > max_content_length:
+                result = result[:max_content_length] + "\n\n[Content truncated...]"
+
+            return {
+                "url": url,
+                "markdown_content": result,
+                "content_length": len(result),
+                "truncated": len(result) > max_content_length
+            }
+
+        except Exception as e:
+            return {
+                "error": f"MarkItDown conversion failed: {str(e)}"
+            }
+
+
+class TextToAudioTool(MCPTool):
+    """TextToAudio tool for generating audio from text prompts using Pollinations AI"""
+
+    @property
+    def description(self) -> str:
+        return "Generate an audio URL from a text prompt using Pollinations AI text-to-speech service. Returns a direct URL to the generated audio file."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "The text prompt to the audio model (example: 'Read this: Hello, world!')"
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Voice option for text-to-speech (default: 'alloy')",
+                    "default": "alloy"
+                },
+                "url_encode": {
+                    "type": "boolean",
+                    "description": "Whether to URL-encode the prompt text (default: True)",
+                    "default": True
+                }
+            },
+            "required": ["prompt"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute text-to-speech conversion
+
+        Returns:
+            Dict[str, Any]: Audio URL or error message
+        """
+        try:
+            import urllib.parse
+        except ImportError as e:
+            return {
+                "error": f"urllib is not available: {str(e)}"
+            }
+
+        prompt = arguments.get("prompt", "")
+        voice = arguments.get("voice", "alloy")
+        url_encode = arguments.get("url_encode", True)
+
+        if not prompt:
+            return {
+                "error": "Prompt parameter is required"
+            }
+
+        # Validate prompt length (reasonable limit for text-to-speech)
+        if len(prompt) > 5000:
+            return {
+                "error": "Prompt is too long (max 5000 characters)"
+            }
+
+        try:
+            # Prepare the prompt for URL
+            if url_encode:
+                encoded_prompt = urllib.parse.quote(prompt)
+            else:
+                encoded_prompt = prompt.replace(" ", "%20")  # Basic space encoding
+
+            # Construct the Pollinations AI text-to-speech URL
+            base_url = "https://text.pollinations.ai"
+            audio_url = f"{base_url}/{encoded_prompt}?voice={voice}"
+
+            return {
+                "prompt": prompt,
+                "voice": voice,
+                "audio_url": audio_url
+            }
+
+        except Exception as e:
+            return {
+                "error": f"Text-to-speech URL generation failed: {str(e)}"
+            }
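Two minimal usage sketches for the classes added above, assuming they are importable from g4f.mcp.tools and that the MarkItDown integration is installed; the URL, prompt, and limits are illustrative:

    # Hypothetical driver for the new tools; argument values are examples only.
    import asyncio
    from g4f.mcp.tools import MarkItDownTool, TextToAudioTool

    async def main():
        # Convert a page to markdown, capping the returned content at 2000 characters.
        md_result = await MarkItDownTool().execute({
            "url": "https://example.com",
            "max_content_length": 2000,
        })
        print(md_result.get("markdown_content", md_result.get("error")))

        # Build a Pollinations text-to-speech URL for a short prompt.
        tts_result = await TextToAudioTool().execute({
            "prompt": "Read this: Hello, world!",
            "voice": "alloy",
        })
        # Expected shape: {"prompt": ..., "voice": "alloy",
        #                  "audio_url": "https://text.pollinations.ai/<encoded prompt>?voice=alloy"}
        print(tts_result.get("audio_url", tts_result.get("error")))

    asyncio.run(main())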
@@ -3,14 +3,14 @@ from __future__ import annotations
 import hashlib
+import asyncio
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Dict, Iterator, Optional
 from urllib.parse import urlparse, quote_plus
 from aiohttp import ClientSession, ClientError
 from datetime import date
-import asyncio
 
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, Tag
     has_requirements = True
 except ImportError:
     has_requirements = False
@@ -18,11 +18,39 @@ except ImportError:
 from ..cookies import get_cookies_dir
 from ..providers.response import format_link
 
-def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
+def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2, add_metadata: bool = False) -> Iterator[str]:
     """
     Parses the provided HTML and yields text fragments.
     """
     soup = BeautifulSoup(html, "html.parser")
 
+    # Read the meta tags
+    if add_metadata:
+        metadata: Dict[str, str] = {}
+
+        if soup.title and soup.title.string:
+            yield f"## {soup.title.string}\n"
+            seen_texts.append(soup.title.string)
+            max_words = None if max_words is None else max_words - len(soup.title.string.split())
+
+        for meta in soup(["meta"]):
+            if not isinstance(meta, Tag):
+                continue
+
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
+                    break
+
+        description = metadata.get('description', metadata.get('og:description', '')).strip()
+        if description:
+            yield f"### Description\n{description}\n"
+            seen_texts.append(description)
+            max_words = None if max_words is None else max_words - len(description.split())
+
     for selector in [
         "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
         ".content-wrapper", "#content", "#mainContent",
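A standalone illustration of the metadata pass added above: the same BeautifulSoup logic applied to a tiny HTML snippet. This is a sketch mirroring the new branch, not the repository function itself; bs4 must be installed and the HTML is made up:

    # Mirrors the add_metadata branch of scrape_text on example HTML.
    from bs4 import BeautifulSoup, Tag

    html = ('<html><head><title>Example Domain</title>'
            '<meta name="description" content="An example page."></head>'
            '<body><p>Hello</p></body></html>')

    soup = BeautifulSoup(html, "html.parser")
    metadata = {}
    for meta in soup(["meta"]):
        if not isinstance(meta, Tag):
            continue
        for a in meta.attrs:
            if a in ["itemprop", "property", "name"]:
                key, content = str(meta.get(a, "")), str(meta.get("content", ""))
                if key and content:
                    metadata[key] = content
                break

    print(f"## {soup.title.string}")                               # -> ## Example Domain
    print(f"### Description\n{metadata.get('description', '')}")   # -> An example page.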
@@ -74,14 +102,14 @@ def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
         domain = urlparse(link).netloc
         yield f"\nSource: [{domain}]({link})"
 
-async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
+async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, add_metadata: bool = False, proxy: str = None) -> str:
     """
     Fetches a URL and returns the scraped text, using caching to avoid redundant downloads.
     """
     try:
         cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         cache_dir.mkdir(parents=True, exist_ok=True)
-        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
+        md5_hash = hashlib.md5(url.encode(errors="ignore")+str([max_words, add_source, add_metadata]).encode(errors="ignore")).hexdigest()
         cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
         if cache_file.exists():
            return cache_file.read_text()
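The cache key built above now folds the scraping options into the hash, so the same URL fetched with different max_words / add_source / add_metadata values lands in separate cache files. An illustrative recomputation with example values:

    # Recomputes the cache file name from the hunk above; URL and options are examples.
    import hashlib
    from datetime import date
    from urllib.parse import quote_plus

    url = "https://example.com/docs/page?ref=1"
    max_words, add_source, add_metadata = 1000, False, True

    md5_hash = hashlib.md5(
        url.encode(errors="ignore")
        + str([max_words, add_source, add_metadata]).encode(errors="ignore")
    ).hexdigest()
    name = quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])
    print(f"{name}.{date.today()}.{md5_hash[:16]}.cache")
    # e.g. example.com+docs+page.2025-12-05.<16 hex chars>.cache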
@@ -89,7 +117,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
         async with session.get(url, proxy=proxy) as response:
             if response.status == 200:
                 html = await response.text(errors="replace")
-                scraped_text = "".join(scrape_text(html, max_words, add_source))
+                scraped_text = "".join(scrape_text(html, max_words, add_source, add_metadata=add_metadata))
                 with open(cache_file, "wb") as f:
                     f.write(scraped_text.encode(errors="replace"))
                 return scraped_text
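A minimal end-to-end sketch of the updated fetch_and_scrape call; the module path is an assumption based on this diff (the page does not show the file name), and aiohttp plus bs4 are required:

    # Usage sketch; the import path below is assumed, not confirmed by this page.
    import asyncio
    from aiohttp import ClientSession
    from g4f.tools.web_search import fetch_and_scrape

    async def main():
        async with ClientSession() as session:
            text = await fetch_and_scrape(
                session,
                "https://example.com",
                max_words=500,
                add_metadata=True,  # new flag from this commit: prepend title/description
            )
            print(text[:200])

    asyncio.run(main())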