Mirror of https://github.com/xtekky/gpt4free.git
Fix errors on read buckets
Update requirements for docx
This commit is contained in: parent 292d5b69cd, commit 796d6d6e14
4 changed files with 19 additions and 17 deletions
@@ -126,7 +126,7 @@ def get_buckets():
     buckets_dir = os.path.join(get_cookies_dir(), "buckets")
     try:
         return [d for d in os.listdir(buckets_dir) if os.path.isdir(os.path.join(buckets_dir, d))]
-    except OSError as e:
+    except OSError:
         return None
 
 def spacy_refine_chunks(source_iterator):
@@ -254,14 +254,14 @@ def read_bucket(bucket_dir: Path):
     cache_file = bucket_dir / PLAIN_CACHE
     spacy_file = bucket_dir / f"spacy_0001.cache"
     if not spacy_file.exists():
-        yield cache_file.read_text()
+        yield cache_file.read_text(errors="replace")
     for idx in range(1, 1000):
         spacy_file = bucket_dir / f"spacy_{idx:04d}.cache"
         plain_file = bucket_dir / f"plain_{idx:04d}.cache"
         if spacy_file.exists():
-            yield spacy_file.read_text()
+            yield spacy_file.read_text(errors="replace")
         elif plain_file.exists():
-            yield plain_file.read_text()
+            yield plain_file.read_text(errors="replace")
         else:
             break
 
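The read_bucket() hunk above is the core of the fix: cache files containing bytes that do not decode cleanly no longer abort the generator. A minimal sketch of the difference, not part of the commit, using a hypothetical cache file and an explicit UTF-8 encoding (the patched code relies on the default encoding):

from pathlib import Path
import tempfile

# Hypothetical cache file whose contents are not valid UTF-8.
cache_file = Path(tempfile.mkdtemp()) / "plain_0001.cache"
cache_file.write_bytes(b"hello \xff\xfe world")

# Old behaviour: strict decoding raises UnicodeDecodeError.
# cache_file.read_text(encoding="utf-8")

# New behaviour: undecodable bytes become U+FFFD and reading continues.
print(cache_file.read_text(encoding="utf-8", errors="replace"))  # prints: hello �� world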
@@ -277,7 +277,7 @@ def stream_read_parts_and_refine(bucket_dir: Path, delete_files: bool = False) -
         cache_file = Path(bucket_dir) / f"spacy_{idx:04d}.cache"
         if cache_file.exists():
             with open(cache_file, "r") as f:
-                yield f.read()
+                yield f.read(errors="replace")
             continue
         if not part.exists():
             break
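One caveat on the stream_read_parts_and_refine() hunk: the errors argument belongs to open() (or Path.read_text()), not to the file object's read(), which only accepts an optional size. A sketch of the open()-based form, with read_spacy_cache as a hypothetical stand-in for the loop body:

from pathlib import Path
from typing import Iterator

def read_spacy_cache(cache_file: Path) -> Iterator[str]:
    # Sketch only: the decoding policy is set when the file is opened,
    # then read() returns the already-decoded text.
    with open(cache_file, "r", encoding="utf-8", errors="replace") as f:
        yield f.read()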
@@ -485,8 +485,8 @@ def get_downloads_urls(bucket_dir: Path, delete_files: bool = False) -> Iterator
         elif "urls" in item:
             yield item
 
-def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iterator[str]:
-    urls = get_downloads_urls(bucket_dir)
+def read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> Iterator[str]:
+    urls = get_downloads_urls(bucket_dir, delete_files)
     if urls:
         count = 0
         with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
@@ -497,8 +497,8 @@ def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iter
                 count += 1
                 yield f'data: {json.dumps({"action": "download", "count": count})}\n\n'
 
-async def async_read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> AsyncIterator[str]:
-    urls = get_downloads_urls(bucket_dir)
+async def async_read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> AsyncIterator[str]:
+    urls = get_downloads_urls(bucket_dir, delete_files)
     if urls:
         count = 0
         with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
@@ -513,7 +513,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
     if refine_chunks_with_spacy:
         for chunk in stream_read_parts_and_refine(bucket_dir, delete_files):
             if event_stream:
-                size += len(chunk)
+                size += len(chunk.decode('utf-8'))
                 yield f'data: {json.dumps({"action": "refine", "size": size})}\n\n'
             else:
                 yield chunk
@@ -522,7 +522,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
         streaming = cache_stream(streaming, bucket_dir)
     for chunk in streaming:
         if event_stream:
-            size += len(chunk)
+            size += len(chunk.decode('utf-8'))
             yield f'data: {json.dumps({"action": "load", "size": size})}\n\n'
         else:
             yield chunk
@@ -541,7 +541,7 @@ def get_streaming(bucket_dir: str, delete_files = False, refine_chunks_with_spac
     bucket_dir = Path(bucket_dir)
     bucket_dir.mkdir(parents=True, exist_ok=True)
     try:
-        yield from read_and_download_urls(bucket_dir, event_stream)
+        yield from read_and_download_urls(bucket_dir, delete_files, event_stream)
         yield from stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream)
     except Exception as e:
         if event_stream:
@@ -552,7 +552,7 @@ async def get_async_streaming(bucket_dir: str, delete_files = False, refine_chun
     bucket_dir = Path(bucket_dir)
     bucket_dir.mkdir(parents=True, exist_ok=True)
     try:
-        async for chunk in async_read_and_download_urls(bucket_dir, event_stream):
+        async for chunk in async_read_and_download_urls(bucket_dir, delete_files, event_stream):
             yield chunk
         for chunk in stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream):
             yield chunk
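Taken together, these hunks thread the existing delete_files flag from get_streaming() and get_async_streaming() through read_and_download_urls() down to get_downloads_urls(). A usage sketch, assuming the helpers live in g4f.tools.files (the module path is not shown in this diff) and using an illustrative bucket directory:

# Sketch only; the import path and bucket location are assumptions, not part of the diff.
from g4f.tools.files import get_streaming

bucket_dir = "./buckets/example-bucket"  # hypothetical bucket directory
for chunk in get_streaming(bucket_dir, delete_files=True, event_stream=True):
    # With event_stream=True the generator yields server-sent-event lines,
    # e.g. data: {"action": "load", "size": ...}
    print(chunk, end="")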
@@ -16,3 +16,4 @@ aiohttp_socks
 cryptography
 python-multipart
 pypdf2
+python-docx
@@ -19,4 +19,4 @@ cryptography
 nodriver
 python-multipart
 pypdf2
-docx
+python-docx
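Both requirements hunks swap docx for python-docx: on PyPI, python-docx is the distribution that provides the docx import, while the older docx distribution is a different package. A short sketch, not part of the commit:

# pip install python-docx
from docx import Document  # the import name stays "docx" even though the distribution is "python-docx"

doc = Document()
doc.add_paragraph("hello from python-docx")
doc.save("example.docx")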
setup.py (5 changes)
@@ -40,7 +40,7 @@ EXTRA_REQUIRE = {
         "plyer",
         "setuptools",
         "pypdf2", # files
-        "docx",
+        "python-docx",
         "odfpy",
         "ebooklib",
         "openpyxl",
@@ -58,6 +58,7 @@ EXTRA_REQUIRE = {
         "uvicorn", # api
         "python-multipart",
         "pypdf2", # files
+        "python-docx",
     ],
     "image": [
         "pillow",
@@ -92,7 +93,7 @@ EXTRA_REQUIRE = {
         "spacy",
         "beautifulsoup4",
         "pypdf2",
-        "docx",
+        "python-docx",
         "odfpy",
         "ebooklib",
         "openpyxl",
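setup.py feeds EXTRA_REQUIRE into setuptools' extras_require, so the renamed dependency is installed when the matching extra is requested. A minimal sketch; only the "image" key appears in this diff, the other names are illustrative:

from setuptools import setup

# Sketch only: extra names other than "image" are illustrative, not taken from the diff.
EXTRA_REQUIRE = {
    "files": ["pypdf2", "python-docx", "odfpy", "ebooklib", "openpyxl"],
    "image": ["pillow"],
}

setup(
    name="g4f",
    extras_require=EXTRA_REQUIRE,  # enables e.g. `pip install g4f[files]`
)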