diff --git a/g4f/tools/files.py b/g4f/tools/files.py index 92ebb8dc..49113133 100644 --- a/g4f/tools/files.py +++ b/g4f/tools/files.py @@ -126,7 +126,7 @@ def get_buckets(): buckets_dir = os.path.join(get_cookies_dir(), "buckets") try: return [d for d in os.listdir(buckets_dir) if os.path.isdir(os.path.join(buckets_dir, d))] - except OSError as e: + except OSError: return None def spacy_refine_chunks(source_iterator): @@ -254,14 +254,14 @@ def read_bucket(bucket_dir: Path): cache_file = bucket_dir / PLAIN_CACHE spacy_file = bucket_dir / f"spacy_0001.cache" if not spacy_file.exists(): - yield cache_file.read_text() + yield cache_file.read_text(errors="replace") for idx in range(1, 1000): spacy_file = bucket_dir / f"spacy_{idx:04d}.cache" plain_file = bucket_dir / f"plain_{idx:04d}.cache" if spacy_file.exists(): - yield spacy_file.read_text() + yield spacy_file.read_text(errors="replace") elif plain_file.exists(): - yield plain_file.read_text() + yield plain_file.read_text(errors="replace") else: break @@ -277,7 +277,7 @@ def stream_read_parts_and_refine(bucket_dir: Path, delete_files: bool = False) - cache_file = Path(bucket_dir) / f"spacy_{idx:04d}.cache" if cache_file.exists(): - with open(cache_file, "r") as f: - yield f.read() + with open(cache_file, "r", errors="replace") as f: + yield f.read() continue if not part.exists(): break @@ -485,8 +485,8 @@ def get_downloads_urls(bucket_dir: Path, delete_files: bool = False) -> Iterator elif "urls" in item: yield item -def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iterator[str]: - urls = get_downloads_urls(bucket_dir) +def read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> Iterator[str]: + urls = get_downloads_urls(bucket_dir, delete_files) if urls: count = 0 with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f: @@ -497,8 +497,8 @@ def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iter count += 1 yield f'data: {json.dumps({"action": "download", 
"count": count})}\n\n' -async def async_read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> AsyncIterator[str]: - urls = get_downloads_urls(bucket_dir) +async def async_read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> AsyncIterator[str]: + urls = get_downloads_urls(bucket_dir, delete_files) if urls: count = 0 with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f: @@ -513,7 +513,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi if refine_chunks_with_spacy: for chunk in stream_read_parts_and_refine(bucket_dir, delete_files): if event_stream: - size += len(chunk) + size += len(chunk.decode('utf-8')) yield f'data: {json.dumps({"action": "refine", "size": size})}\n\n' else: yield chunk @@ -522,7 +522,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi streaming = cache_stream(streaming, bucket_dir) for chunk in streaming: if event_stream: - size += len(chunk) + size += len(chunk.decode('utf-8')) yield f'data: {json.dumps({"action": "load", "size": size})}\n\n' else: yield chunk @@ -541,7 +541,7 @@ def get_streaming(bucket_dir: str, delete_files = False, refine_chunks_with_spac bucket_dir = Path(bucket_dir) bucket_dir.mkdir(parents=True, exist_ok=True) try: - yield from read_and_download_urls(bucket_dir, event_stream) + yield from read_and_download_urls(bucket_dir, delete_files, event_stream) yield from stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream) except Exception as e: if event_stream: @@ -552,7 +552,7 @@ async def get_async_streaming(bucket_dir: str, delete_files = False, refine_chun bucket_dir = Path(bucket_dir) bucket_dir.mkdir(parents=True, exist_ok=True) try: - async for chunk in async_read_and_download_urls(bucket_dir, event_stream): + async for chunk in async_read_and_download_urls(bucket_dir, delete_files, event_stream): yield chunk for chunk in stream_chunks(bucket_dir, delete_files, 
refine_chunks_with_spacy, event_stream): yield chunk diff --git a/requirements-slim.txt b/requirements-slim.txt index 7e42781d..30cd9adc 100644 --- a/requirements-slim.txt +++ b/requirements-slim.txt @@ -15,4 +15,5 @@ beautifulsoup4 aiohttp_socks cryptography python-multipart -pypdf2 \ No newline at end of file +pypdf2 +python-docx \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 58a501f7..26a22d91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ cryptography nodriver python-multipart pypdf2 -docx \ No newline at end of file +python-docx \ No newline at end of file diff --git a/setup.py b/setup.py index 2114cf2f..4e905d73 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ EXTRA_REQUIRE = { "plyer", "setuptools", "pypdf2", # files - "docx", + "python-docx", "odfpy", "ebooklib", "openpyxl", @@ -58,6 +58,7 @@ EXTRA_REQUIRE = { "uvicorn", # api "python-multipart", "pypdf2", # files + "python-docx", ], "image": [ "pillow", @@ -92,7 +93,7 @@ EXTRA_REQUIRE = { "spacy", "beautifulsoup4", "pypdf2", - "docx", + "python-docx", "odfpy", "ebooklib", "openpyxl",