Add MarkItDownTool and TextToAudioTool; enhance scrape_text and fetch_and_scrape functions with metadata support

Author: hlohaus
Date:   2025-11-02 04:32:32 +01:00
Commit: 5d53e58d2c
Parent: 8df4bc7118
4 changed files with 201 additions and 11 deletions

View file

@@ -8,6 +8,6 @@ through the Model Context Protocol standard, allowing AI assistants to access:
 """
 from .server import MCPServer
-from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
-__all__ = ['MCPServer', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']
+__all__ = ['MCPServer', 'MarkItDownTool', 'TextToAudioTool', 'WebSearchTool', 'WebScrapeTool', 'ImageGenerationTool']

View file

@@ -20,6 +20,7 @@ from ..debug import enable_logging
 enable_logging()
+from .tools import MarkItDownTool, TextToAudioTool, WebSearchTool, WebScrapeTool, ImageGenerationTool
 from .tools import WebSearchTool, WebScrapeTool, ImageGenerationTool
@@ -54,6 +55,8 @@ class MCPServer:
             'web_search': WebSearchTool(),
             'web_scrape': WebScrapeTool(),
             'image_generation': ImageGenerationTool(),
+            'text_to_audio': TextToAudioTool(),
+            'mark_it_down': MarkItDownTool()
         }
         self.server_info = {
             "name": "gpt4free-mcp-server",

View file

@@ -8,7 +8,6 @@ This module provides MCP tool implementations that wrap gpt4free capabilities:
 from __future__ import annotations
-import asyncio
 from typing import Any, Dict
 from abc import ABC, abstractmethod
@@ -158,7 +157,7 @@ class WebScrapeTool(MCPTool):
                 session=session,
                 url=url,
                 max_words=max_words,
-                add_source=True
+                add_metadata=True
             )
             if not content:
@@ -183,7 +182,7 @@ class ImageGenerationTool(MCPTool):
     @property
     def description(self) -> str:
-        return "Generate images from text prompts using AI image generation providers. Returns base64-encoded image data."
+        return "Generate images from text prompts using AI image generation providers. Returns a URL to the generated image."
     @property
     def input_schema(self) -> Dict[str, Any]:
@@ -291,3 +290,163 @@ class ImageGenerationTool(MCPTool):
             return {
                 "error": f"Image generation failed: {str(e)}"
             }
+
+class MarkItDownTool(MCPTool):
+    """MarkItDown tool for converting URLs to markdown format"""
+
+    @property
+    def description(self) -> str:
+        return "Convert a URL to markdown format using MarkItDown. Supports HTTP/HTTPS URLs and returns formatted markdown content."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to convert to markdown format (must be HTTP/HTTPS)"
+                },
+                "max_content_length": {
+                    "type": "integer",
+                    "description": "Maximum content length for processing (default: 10000)",
+                    "default": 10000
+                }
+            },
+            "required": ["url"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute MarkItDown conversion
+
+        Returns:
+            Dict[str, Any]: Markdown content or error message
+        """
+        try:
+            from ..integration.markitdown import MarkItDown
+        except ImportError as e:
+            return {
+                "error": f"MarkItDown is not installed: {str(e)}"
+            }
+
+        url = arguments.get("url", "")
+        max_content_length = arguments.get("max_content_length", 10000)
+
+        if not url:
+            return {
+                "error": "URL parameter is required"
+            }
+
+        # Validate URL format
+        if not url.startswith(("http://", "https://")):
+            return {
+                "error": "URL must start with http:// or https://"
+            }
+
+        try:
+            # Initialize MarkItDown
+            md = MarkItDown()
+
+            # Convert URL to markdown
+            result = md.convert_url(url)
+
+            if not result:
+                return {
+                    "error": "Failed to convert URL to markdown"
+                }
+
+            # Truncate if content exceeds max length
+            if len(result) > max_content_length:
+                result = result[:max_content_length] + "\n\n[Content truncated...]"
+
+            return {
+                "url": url,
+                "markdown_content": result,
+                "content_length": len(result),
+                "truncated": len(result) > max_content_length
+            }
+        except Exception as e:
+            return {
+                "error": f"MarkItDown conversion failed: {str(e)}"
+            }
+
+class TextToAudioTool(MCPTool):
+    """TextToAudio tool for generating audio from text prompts using Pollinations AI"""
+
+    @property
+    def description(self) -> str:
+        return "Generate an audio URL from a text prompt using Pollinations AI text-to-speech service. Returns a direct URL to the generated audio file."
+
+    @property
+    def input_schema(self) -> Dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "The text prompt to the audio model (example: 'Read this: Hello, world!')"
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Voice option for text-to-speech (default: 'alloy')",
+                    "default": "alloy"
+                },
+                "url_encode": {
+                    "type": "boolean",
+                    "description": "Whether to URL-encode the prompt text (default: True)",
+                    "default": True
+                }
+            },
+            "required": ["prompt"]
+        }
+
+    async def execute(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute text-to-speech conversion
+
+        Returns:
+            Dict[str, Any]: Audio URL or error message
+        """
+        try:
+            import urllib.parse
+        except ImportError as e:
+            return {
+                "error": f"urllib is not available: {str(e)}"
+            }
+
+        prompt = arguments.get("prompt", "")
+        voice = arguments.get("voice", "alloy")
+        url_encode = arguments.get("url_encode", True)
+
+        if not prompt:
+            return {
+                "error": "Prompt parameter is required"
+            }
+
+        # Validate prompt length (reasonable limit for text-to-speech)
+        if len(prompt) > 5000:
+            return {
+                "error": "Prompt is too long (max 5000 characters)"
+            }
+
+        try:
+            # Prepare the prompt for the URL
+            if url_encode:
+                encoded_prompt = urllib.parse.quote(prompt)
+            else:
+                encoded_prompt = prompt.replace(" ", "%20")  # Basic space encoding
+
+            # Construct the Pollinations AI text-to-speech URL
+            base_url = "https://text.pollinations.ai"
+            audio_url = f"{base_url}/{encoded_prompt}?voice={voice}"
+
+            return {
+                "prompt": prompt,
+                "voice": voice,
+                "audio_url": audio_url
+            }
+        except Exception as e:
+            return {
+                "error": f"Text-to-speech URL generation failed: {str(e)}"
+            }
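
For a quick local check, both new tools can also be driven directly through the async execute() interface shown above. A minimal sketch, assuming the package is importable as g4f.mcp.tools (MarkItDown conversion additionally needs the optional markitdown integration installed):

# Sketch: calling the new tools directly; the g4f.mcp.tools import path is assumed from the package layout.
import asyncio
from g4f.mcp.tools import MarkItDownTool, TextToAudioTool

async def main():
    # Convert a page to markdown; an "error" key is returned if markitdown is unavailable.
    md = await MarkItDownTool().execute({"url": "https://example.com", "max_content_length": 2000})
    print(md.get("markdown_content", md.get("error")))

    # Build a Pollinations text-to-speech URL for a short prompt.
    tts = await TextToAudioTool().execute({"prompt": "Read this: Hello, world!", "voice": "alloy"})
    print(tts.get("audio_url", tts.get("error")))

asyncio.run(main())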

View file

@@ -3,14 +3,14 @@ from __future__ import annotations
 import hashlib
 import asyncio
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Dict, Iterator, Optional
 from urllib.parse import urlparse, quote_plus
 from aiohttp import ClientSession, ClientError
 from datetime import date
 import asyncio
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, Tag
     has_requirements = True
 except ImportError:
     has_requirements = False
@@ -18,11 +18,39 @@ except ImportError:
 from ..cookies import get_cookies_dir
 from ..providers.response import format_link
-def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2) -> Iterator[str]:
+def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = True, count_images: int = 2, add_metadata: bool = False) -> Iterator[str]:
     """
     Parses the provided HTML and yields text fragments.
     """
     soup = BeautifulSoup(html, "html.parser")
+
+    # Read the meta tags
+    if add_metadata:
+        metadata: Dict[str, str] = {}
+        if soup.title and soup.title.string:
+            yield f"## {soup.title.string}\n"
+            seen_texts.append(soup.title.string)
+            max_words = None if max_words is None else max_words - len(soup.title.string.split())
+        for meta in soup(["meta"]):
+            if not isinstance(meta, Tag):
+                continue
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
+                    break
+        description = metadata.get('description', metadata.get('og:description', '')).strip()
+        if description:
+            yield f"### Description\n{description}\n"
+            seen_texts.append(description)
+            max_words = None if max_words is None else max_words - len(description.split())
+
     for selector in [
         "main", ".main-content-wrapper", ".main-content", ".emt-container-inner",
         ".content-wrapper", "#content", "#mainContent",
@@ -74,14 +102,14 @@ def scrape_text(html: str, max_words: Optional[int] = None, add_source: bool = T
             domain = urlparse(link).netloc
             yield f"\nSource: [{domain}]({link})"
-async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, proxy: str = None) -> str:
+async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional[int] = None, add_source: bool = False, add_metadata: bool = False, proxy: str = None) -> str:
     """
     Fetches a URL and returns the scraped text, using caching to avoid redundant downloads.
     """
     try:
         cache_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         cache_dir.mkdir(parents=True, exist_ok=True)
-        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
+        md5_hash = hashlib.md5(url.encode(errors="ignore")+str([max_words, add_source, add_metadata]).encode(errors="ignore")).hexdigest()
         cache_file = cache_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{date.today()}.{md5_hash[:16]}.cache"
         if cache_file.exists():
             return cache_file.read_text()
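
Note that the cache key now folds the scraping options into the MD5 digest, so the same URL fetched with and without metadata no longer collides on one cache file. A small sketch of the effect (the helper name is illustrative only):

# Mirrors the digest above: URL bytes plus the stringified option list.
import hashlib

def cache_key(url: str, max_words=None, add_source=False, add_metadata=False) -> str:
    data = url.encode(errors="ignore") + str([max_words, add_source, add_metadata]).encode(errors="ignore")
    return hashlib.md5(data).hexdigest()[:16]

print(cache_key("https://example.com"))                     # options at their defaults
print(cache_key("https://example.com", add_metadata=True))  # different digest, different cache file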
@@ -89,7 +117,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: Optional
         async with session.get(url, proxy=proxy) as response:
             if response.status == 200:
                 html = await response.text(errors="replace")
-                scraped_text = "".join(scrape_text(html, max_words, add_source))
+                scraped_text = "".join(scrape_text(html, max_words, add_source, add_metadata=add_metadata))
                 with open(cache_file, "wb") as f:
                     f.write(scraped_text.encode(errors="replace"))
                 return scraped_text