Mirror of https://github.com/xtekky/gpt4free.git, synced 2025-12-06 02:30:41 -08:00

Commit 796d6d6e14 (parent 292d5b69cd)

    Fix errors on read buckets
    Update requirements for docx

4 changed files with 19 additions and 17 deletions
@@ -126,7 +126,7 @@ def get_buckets():
     buckets_dir = os.path.join(get_cookies_dir(), "buckets")
     try:
         return [d for d in os.listdir(buckets_dir) if os.path.isdir(os.path.join(buckets_dir, d))]
-    except OSError as e:
+    except OSError:
         return None
 
 def spacy_refine_chunks(source_iterator):
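The only change in this hunk is dropping the unused exception binding. As a standalone sketch (with the cookies directory passed in directly instead of resolved via get_cookies_dir(), which is assumed here), the function lists bucket sub-directories and returns None when the directory is missing or unreadable:

    import os

    def get_buckets(cookies_dir: str = "./cookies"):
        # In the project this path comes from get_cookies_dir(); a plain argument is assumed here.
        buckets_dir = os.path.join(cookies_dir, "buckets")
        try:
            # Only sub-directories of buckets/ count as buckets.
            return [d for d in os.listdir(buckets_dir)
                    if os.path.isdir(os.path.join(buckets_dir, d))]
        except OSError:
            # Missing or unreadable directory: report "no buckets" instead of raising.
            return None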
@@ -254,14 +254,14 @@ def read_bucket(bucket_dir: Path):
     cache_file = bucket_dir / PLAIN_CACHE
     spacy_file = bucket_dir / f"spacy_0001.cache"
     if not spacy_file.exists():
-        yield cache_file.read_text()
+        yield cache_file.read_text(errors="replace")
     for idx in range(1, 1000):
         spacy_file = bucket_dir / f"spacy_{idx:04d}.cache"
         plain_file = bucket_dir / f"plain_{idx:04d}.cache"
         if spacy_file.exists():
-            yield spacy_file.read_text()
+            yield spacy_file.read_text(errors="replace")
         elif plain_file.exists():
-            yield plain_file.read_text()
+            yield plain_file.read_text(errors="replace")
         else:
             break
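Passing errors="replace" to Path.read_text() substitutes undecodable bytes with U+FFFD instead of raising UnicodeDecodeError, which is presumably the read failure the commit title refers to. A small self-contained demonstration (file name is illustrative):

    from pathlib import Path

    cache_file = Path("demo.cache")
    # 0xFF can never occur in valid UTF-8, so strict decoding fails on it.
    cache_file.write_bytes(b"hello \xff world")

    try:
        cache_file.read_text(encoding="utf-8")
    except UnicodeDecodeError as e:
        print("strict read fails:", e.reason)

    # With errors="replace" the bad byte becomes U+FFFD and the read succeeds.
    print(cache_file.read_text(encoding="utf-8", errors="replace"))
    cache_file.unlink()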
@@ -277,7 +277,7 @@ def stream_read_parts_and_refine(bucket_dir: Path, delete_files: bool = False) -
         cache_file = Path(bucket_dir) / f"spacy_{idx:04d}.cache"
         if cache_file.exists():
             with open(cache_file, "r") as f:
-                yield f.read()
+                yield f.read(errors="replace")
             continue
         if not part.exists():
             break
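Note that a text-mode file object's read() does not take an errors argument; the decode policy belongs on open(). If the intent here is the same errors="replace" handling as in read_bucket, the conventional form would be a sketch like this (helper name is illustrative):

    from pathlib import Path

    def read_cache_text(cache_file: Path) -> str:
        # The replacement policy is configured when the stream is opened, not on read().
        with open(cache_file, "r", encoding="utf-8", errors="replace") as f:
            return f.read()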
@@ -485,8 +485,8 @@ def get_downloads_urls(bucket_dir: Path, delete_files: bool = False) -> Iterator
         elif "urls" in item:
             yield item
 
-def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iterator[str]:
-    urls = get_downloads_urls(bucket_dir)
+def read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> Iterator[str]:
+    urls = get_downloads_urls(bucket_dir, delete_files)
     if urls:
         count = 0
         with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
@@ -497,8 +497,8 @@ def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iter
             count += 1
             yield f'data: {json.dumps({"action": "download", "count": count})}\n\n'
 
-async def async_read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> AsyncIterator[str]:
-    urls = get_downloads_urls(bucket_dir)
+async def async_read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> AsyncIterator[str]:
+    urls = get_downloads_urls(bucket_dir, delete_files)
     if urls:
         count = 0
         with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
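With event_stream enabled, these generators report progress as Server-Sent-Events-style frames: a data: line carrying a small JSON payload, terminated by a blank line. A self-contained sketch of the counting pattern (names and URLs are illustrative, the download step is stubbed out):

    import json
    from typing import Iterator

    def download_events(urls: list[str]) -> Iterator[str]:
        count = 0
        for url in urls:
            # ... download url into the bucket directory here ...
            count += 1
            # One SSE frame per completed download.
            yield f'data: {json.dumps({"action": "download", "count": count})}\n\n'

    for event in download_events(["https://example.com/a", "https://example.com/b"]):
        print(event, end="")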
@@ -513,7 +513,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
     if refine_chunks_with_spacy:
         for chunk in stream_read_parts_and_refine(bucket_dir, delete_files):
             if event_stream:
-                size += len(chunk)
+                size += len(chunk.decode('utf-8'))
                 yield f'data: {json.dumps({"action": "refine", "size": size})}\n\n'
             else:
                 yield chunk
@@ -522,7 +522,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
         streaming = cache_stream(streaming, bucket_dir)
         for chunk in streaming:
             if event_stream:
-                size += len(chunk)
+                size += len(chunk.decode('utf-8'))
                 yield f'data: {json.dumps({"action": "load", "size": size})}\n\n'
             else:
                 yield chunk
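Both hunks above change the reported size from len(chunk) to len(chunk.decode('utf-8')), i.e. from a byte count to a character count, assuming the chunks at this point are bytes (a str chunk has no .decode() method). For multi-byte UTF-8 the two numbers differ:

    chunk = "héllo 🌍".encode("utf-8")
    print(len(chunk))                  # 11 bytes
    print(len(chunk.decode("utf-8")))  # 7 characters of text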
@@ -541,7 +541,7 @@ def get_streaming(bucket_dir: str, delete_files = False, refine_chunks_with_spac
     bucket_dir = Path(bucket_dir)
     bucket_dir.mkdir(parents=True, exist_ok=True)
     try:
-        yield from read_and_download_urls(bucket_dir, event_stream)
+        yield from read_and_download_urls(bucket_dir, delete_files, event_stream)
         yield from stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream)
     except Exception as e:
         if event_stream:
@@ -552,7 +552,7 @@ async def get_async_streaming(bucket_dir: str, delete_files = False, refine_chun
     bucket_dir = Path(bucket_dir)
     bucket_dir.mkdir(parents=True, exist_ok=True)
     try:
-        async for chunk in async_read_and_download_urls(bucket_dir, event_stream):
+        async for chunk in async_read_and_download_urls(bucket_dir, delete_files, event_stream):
             yield chunk
         for chunk in stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream):
             yield chunk
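Because the new delete_files parameter sits before event_stream in both signatures, the call sites in get_streaming and get_async_streaming have to pass it explicitly, as the two hunks above do; an old-style two-argument positional call would silently bind the event_stream flag to delete_files instead. A toy illustration of that pitfall (names are illustrative, not the project's):

    def updated_signature(bucket_dir, delete_files: bool = False, event_stream: bool = False):
        return {"delete_files": delete_files, "event_stream": event_stream}

    # Written against the old (bucket_dir, event_stream) order: the flag lands in delete_files.
    print(updated_signature("bucket", True))
    # Keyword arguments keep the intent explicit and survive this kind of signature change.
    print(updated_signature("bucket", event_stream=True))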
@@ -16,3 +16,4 @@ aiohttp_socks
 cryptography
 python-multipart
 pypdf2
+python-docx
@@ -19,4 +19,4 @@ cryptography
 nodriver
 python-multipart
 pypdf2
-docx
+python-docx
setup.py (5 changes)
@@ -40,7 +40,7 @@ EXTRA_REQUIRE = {
         "plyer",
         "setuptools",
         "pypdf2", # files
-        "docx",
+        "python-docx",
         "odfpy",
         "ebooklib",
         "openpyxl",
@@ -58,6 +58,7 @@ EXTRA_REQUIRE = {
         "uvicorn", # api
         "python-multipart",
         "pypdf2", # files
+        "python-docx",
     ],
     "image": [
         "pillow",
@@ -92,7 +93,7 @@ EXTRA_REQUIRE = {
         "spacy",
         "beautifulsoup4",
         "pypdf2",
-        "docx",
+        "python-docx",
         "odfpy",
         "ebooklib",
         "openpyxl",
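The dependency rename matters because on PyPI python-docx is the maintained .docx library, while the similarly named docx distribution is an older, separate project; despite its distribution name, python-docx is imported as docx. A minimal sketch, assuming python-docx is installed:

    from docx import Document  # provided by the python-docx distribution

    doc = Document()
    doc.add_paragraph("Hello from python-docx.")
    doc.save("example.docx")

    # Read the text back out of the saved file.
    for paragraph in Document("example.docx").paragraphs:
        print(paragraph.text)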