Mirror of https://github.com/xtekky/gpt4free.git, synced 2025-12-05 18:20:35 -08:00
Add YouTubeConverter for enhanced document conversion and update import paths
commit f571237eb7 (parent da6c00e2a2)
4 changed files with 251 additions and 3 deletions
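In short, the commit adds a YouTube-aware converter to g4f's markitdown integration, registers it in the integration's MarkItDown subclass, and repoints an existing import at that subclass. A minimal usage sketch of the result, assuming the package layout this commit introduces (the watch URL is a placeholder):

```python
from g4f.integration.markitdown import MarkItDown

md = MarkItDown()  # registers AudioConverter, ImageConverter, and now YouTubeConverter

# convert_url() fetches the page; for a YouTube watch URL the new converter
# emits title, video metadata, description, and (when available) the transcript.
result = md.convert_url("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
print(result.text_content)
```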
.github/workflows/build-packages.yml (vendored, 3 changes)
@@ -430,6 +430,7 @@ jobs:
       with:
         tag_name: ${{ needs.prepare.outputs.version }}
         name: Release ${{ needs.prepare.outputs.version }}
+        append_body: true
         body: |
           ## g4f ${{ needs.prepare.outputs.version }}
 
@@ -446,7 +447,7 @@ jobs:
           - macOS ARM64: `g4f-macos-${{ needs.prepare.outputs.version }}-arm64`
 
           **System Packages:**
-          - WinGet: `winget install g4f` (after manifest approval)
+          - WinGet: `winget install gpt4free`
 
           **Docker:**
           - `docker pull hlohaus789/g4f:${{ needs.prepare.outputs.version }}`
g4f/integration/markitdown/__init__.py (path inferred from the import change below)

@@ -14,6 +14,7 @@ from markitdown._exceptions import (
 
 from ._audio_converter import AudioConverter
 from ._image_converter import ImageConverter
+from ._youtube_converter import YouTubeConverter
 
 class MarkItDown(BaseMarkItDown):
     """(In preview) An extremely simple text-based document reader, suitable for LLM use.

@@ -26,6 +27,7 @@ class MarkItDown(BaseMarkItDown):
         super().__init__(**kwargs)
         self.register_converter(AudioConverter())
         self.register_converter(ImageConverter())
+        self.register_converter(YouTubeConverter())
 
     def _convert(
         self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
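The hunks above show markitdown's two-method converter protocol: accepts() decides whether a converter claims a stream, convert() produces the result, and register_converter() wires a converter into the dispatch loop. A minimal sketch of adding one more converter the same way this commit registers YouTubeConverter; PlainTextConverter and its .txt rule are illustrative, not part of the commit:

```python
from typing import Any, BinaryIO

from g4f.integration.markitdown import MarkItDown
from markitdown._base_converter import DocumentConverter, DocumentConverterResult
from markitdown._stream_info import StreamInfo

class PlainTextConverter(DocumentConverter):
    """Illustrative converter: claims bare .txt streams and passes them through."""

    def accepts(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> bool:
        # Claim only streams whose extension says plain text.
        return (stream_info.extension or "").lower() == ".txt"

    def convert(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> DocumentConverterResult:
        text = file_stream.read().decode(stream_info.charset or "utf-8")
        return DocumentConverterResult(markdown=text, title=None)

md = MarkItDown()  # already carries the Audio/Image/YouTube converters
md.register_converter(PlainTextConverter())
```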
g4f/integration/markitdown/_youtube_converter.py (new file, 245 lines)
@@ -0,0 +1,245 @@
+import json
+import time
+import re
+import bs4
+from typing import Any, BinaryIO, Dict, List, Union
+from urllib.parse import parse_qs, urlparse, unquote
+
+from markitdown._base_converter import DocumentConverter, DocumentConverterResult
+from markitdown._stream_info import StreamInfo
+
+# Optional YouTube transcription support
+try:
+    # Suppress some warnings on library import
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        # Patch submitted upstream to fix the SyntaxWarning
+        from youtube_transcript_api import YouTubeTranscriptApi
+
+    IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
+except ModuleNotFoundError:
+    IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
+
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/html",
+    "application/xhtml",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+
+
+class YouTubeConverter(DocumentConverter):
+    """Handle YouTube specially, focusing on the video title, description, and transcript."""
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Make sure we're dealing with HTML content *from* YouTube.
+        """
+        url = stream_info.url or ""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        url = unquote(url)
+        url = url.replace(r"\?", "?").replace(r"\=", "=")
+
+        if not url.startswith("https://www.youtube.com/watch?"):
+            # Not a YouTube URL
+            return False
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        # Not HTML content
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        # Parse the stream
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        print(file_stream)
+        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+
+        # Read the meta tags
+        metadata: Dict[str, str] = {}
+
+        if soup.title and soup.title.string:
+            metadata["title"] = soup.title.string
+
+        for meta in soup(["meta"]):
+            if not isinstance(meta, bs4.Tag):
+                continue
+
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
+                    break
+
+        print(f"Extracted metadata keys: {list(metadata.keys())}")
+
+        # Try reading the description
+        try:
+            for script in soup(["script"]):
+                if not isinstance(script, bs4.Tag):
+                    continue
+                if not script.string:  # Skip empty scripts
+                    continue
+                content = script.string
+                if "ytInitialData" in content:
+                    match = re.search(r"var ytInitialData = ({.*?});", content)
+                    if match:
+                        data = json.loads(match.group(1))
+                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")
+                        if attrdesc and isinstance(attrdesc, dict):
+                            metadata["description"] = str(attrdesc.get("content", ""))
+                    break
+        except Exception as e:
+            print(f"Error extracting description: {e}")
+            pass
+
+        # Start preparing the page
+        webpage_text = "# YouTube\n"
+
+        title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
+        assert isinstance(title, str)
+
+        if title:
+            webpage_text += f"\n## {title}\n"
+
+        stats = ""
+        views = self._get(metadata, ["interactionCount"])  # type: ignore
+        if views:
+            stats += f"- **Views:** {views}\n"
+
+        keywords = self._get(metadata, ["keywords"])  # type: ignore
+        if keywords:
+            stats += f"- **Keywords:** {keywords}\n"
+
+        runtime = self._get(metadata, ["duration"])  # type: ignore
+        if runtime:
+            stats += f"- **Runtime:** {runtime}\n"
+
+        if len(stats) > 0:
+            webpage_text += f"\n### Video Metadata\n{stats}\n"
+
+        description = self._get(metadata, ["description", "og:description"])  # type: ignore
+        if description:
+            webpage_text += f"\n### Description\n{description}\n"
+
+        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+            try:
+                ytt_api = YouTubeTranscriptApi()
+                transcript_text = ""
+                parsed_url = urlparse(stream_info.url)  # type: ignore
+                params = parse_qs(parsed_url.query)  # type: ignore
+                if "v" in params and params["v"][0]:
+                    video_id = str(params["v"][0])
+                    transcript_list = ytt_api.list(video_id)
+                    languages = ["en"]
+                    for transcript in transcript_list:
+                        languages.append(transcript.language_code)
+                        break
+                    try:
+                        youtube_transcript_languages = kwargs.get(
+                            "youtube_transcript_languages", languages
+                        )
+                        # Retry the transcript fetching operation
+                        transcript = self._retry_operation(
+                            lambda: ytt_api.fetch(
+                                video_id, languages=youtube_transcript_languages
+                            ),
+                            retries=3,  # Retry 3 times
+                            delay=2,  # 2 seconds delay between retries
+                        )
+
+                        if transcript:
+                            transcript_text = " ".join(
+                                [part.text for part in transcript]
+                            )  # type: ignore
+                    except Exception as e:
+                        # No transcript available
+                        if len(languages) == 1:
+                            print(f"Error fetching transcript: {e}")
+                        else:
+                            # Translate transcript into first kwarg
+                            transcript = (
+                                transcript_list.find_transcript(languages)
+                                .translate(youtube_transcript_languages[0])
+                                .fetch()
+                            )
+                            transcript_text = " ".join([part.text for part in transcript])
+                if transcript_text:
+                    webpage_text += f"\n### Transcript\n{transcript_text}\n"
+            except Exception as e:
+                print(f"Error processing transcript: {e}")
+                pass
+
+        title = title if title else (soup.title.string if soup.title else "")
+        assert isinstance(title, str)
+
+        return DocumentConverterResult(
+            markdown=webpage_text,
+            title=title,
+        )
+
+    def _get(
+        self,
+        metadata: Dict[str, str],
+        keys: List[str],
+        default: Union[str, None] = None,
+    ) -> Union[str, None]:
+        """Get first non-empty value from metadata matching given keys."""
+        for k in keys:
+            if k in metadata:
+                return metadata[k]
+        return default
+
+    def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
+        """Recursively search for a key in nested dictionary/list structures."""
+        if isinstance(json, list):
+            for elm in json:
+                ret = self._findKey(elm, key)
+                if ret is not None:
+                    return ret
+        elif isinstance(json, dict):
+            for k, v in json.items():
+                if k == key:
+                    return json[k]
+                if result := self._findKey(v, key):
+                    return result
+        return None
+
+    def _retry_operation(self, operation, retries=3, delay=2):
+        """Retries the operation if it fails."""
+        attempt = 0
+        while attempt < retries:
+            try:
+                return operation()  # Attempt the operation
+            except Exception as e:
+                print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(delay)  # Wait before retrying
+                attempt += 1
+        # If all attempts fail, raise the last exception
+        raise Exception(f"Operation failed after {retries} attempts.")
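The converter can also be driven directly, without going through MarkItDown's dispatch. A sketch under the assumption that a watch page was saved to disk beforehand; the file name and URL are placeholders, and the StreamInfo fields mirror the attributes the code above reads (url, mimetype, extension, charset):

```python
import io

from g4f.integration.markitdown._youtube_converter import YouTubeConverter
from markitdown._stream_info import StreamInfo

converter = YouTubeConverter()
with open("watch_page.html", "rb") as f:  # previously saved YouTube watch page (assumption)
    stream = io.BytesIO(f.read())

info = StreamInfo(
    url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    mimetype="text/html",
    extension=".html",
    charset="utf-8",
)

# accepts() checks only the URL, extension, and mimetype; convert() reads the stream.
if converter.accepts(stream, info):
    result = converter.convert(stream, info, youtube_transcript_languages=["en"])
    print(result.markdown)
```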
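Note that _findKey returns whatever value sits under the first matching key, not necessarily a string despite its annotation; the description lookup above relies on this by expecting a dict. A quick check on made-up data shaped loosely like ytInitialData:

```python
from g4f.integration.markitdown._youtube_converter import YouTubeConverter

conv = YouTubeConverter()
data = {"contents": [{"description": {"attributedDescriptionBodyText": {"content": "A demo"}}}]}

# The nested dict comes back as-is; convert() then reads its "content" field.
assert conv._findKey(data, "attributedDescriptionBodyText") == {"content": "A demo"}
```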
@@ -69,7 +69,7 @@ try:
 except ImportError:
     has_beautifulsoup4 = False
 try:
-    from markitdown import MarkItDown
+    from g4f.integration.markitdown import MarkItDown
     has_markitdown = True
 except ImportError:
     has_markitdown = False
@@ -434,7 +434,7 @@ async def download_urls(
         text_content = None
         if has_markitdown:
             try:
-                text_content = md.convert(url).text_content
+                text_content = md.convert_url(url).text_content
                 if text_content:
                     filename = get_filename_from_url(url)
                     target = bucket_dir / filename
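Both call forms exist on markitdown's MarkItDown: convert() is the generic entry point that dispatches on its source argument, while convert_url() fetches the remote page explicitly, which is what download_urls needs here. A hedged comparison; the URL and local path are placeholders, and the exact dispatch rules of convert() are an assumption, not something this diff establishes:

```python
from g4f.integration.markitdown import MarkItDown

md = MarkItDown()

# Explicit remote fetch: download the page, then run the converter chain.
remote = md.convert_url("https://www.youtube.com/watch?v=dQw4w9WgXcQ")

# Generic entry point: also accepts local paths and binary streams (assumption).
local = md.convert("saved_page.html")

print(remote.text_content)
```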