Fix errors on read buckets

Update requirements for docx
hlohaus 2025-02-19 17:10:03 +01:00
parent 292d5b69cd
commit 796d6d6e14
4 changed files with 19 additions and 17 deletions
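
A note on the "read buckets" fix below: Path.read_text() decodes strictly by default, so a single byte that is not valid UTF-8 in a cached bucket file raises UnicodeDecodeError and aborts the read, while errors="replace" substitutes U+FFFD and keeps going. A minimal sketch of the difference, not taken from the repository and using a made-up file name:

from pathlib import Path

# Hypothetical cache file containing one byte that is not valid UTF-8.
cache_file = Path("plain_0001.cache")
cache_file.write_bytes(b"hello \xff world")

# cache_file.read_text() would raise UnicodeDecodeError here (assuming a
# UTF-8 locale); errors="replace" swaps the bad byte for U+FFFD instead.
print(cache_file.read_text(errors="replace"))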

@@ -126,7 +126,7 @@ def get_buckets():
     buckets_dir = os.path.join(get_cookies_dir(), "buckets")
     try:
         return [d for d in os.listdir(buckets_dir) if os.path.isdir(os.path.join(buckets_dir, d))]
-    except OSError as e:
+    except OSError:
         return None
 
 def spacy_refine_chunks(source_iterator):
@@ -254,14 +254,14 @@ def read_bucket(bucket_dir: Path):
     cache_file = bucket_dir / PLAIN_CACHE
     spacy_file = bucket_dir / f"spacy_0001.cache"
     if not spacy_file.exists():
-        yield cache_file.read_text()
+        yield cache_file.read_text(errors="replace")
     for idx in range(1, 1000):
         spacy_file = bucket_dir / f"spacy_{idx:04d}.cache"
         plain_file = bucket_dir / f"plain_{idx:04d}.cache"
         if spacy_file.exists():
-            yield spacy_file.read_text()
+            yield spacy_file.read_text(errors="replace")
         elif plain_file.exists():
-            yield plain_file.read_text()
+            yield plain_file.read_text(errors="replace")
         else:
             break
@@ -277,7 +277,7 @@ def stream_read_parts_and_refine(bucket_dir: Path, delete_files: bool = False) -
         cache_file = Path(bucket_dir) / f"spacy_{idx:04d}.cache"
         if cache_file.exists():
             with open(cache_file, "r") as f:
-                yield f.read()
+                yield f.read(errors="replace")
             continue
         if not part.exists():
             break
@@ -485,8 +485,8 @@ def get_downloads_urls(bucket_dir: Path, delete_files: bool = False) -> Iterator
         elif "urls" in item:
             yield item
 
-def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iterator[str]:
-    urls = get_downloads_urls(bucket_dir)
+def read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> Iterator[str]:
+    urls = get_downloads_urls(bucket_dir, delete_files)
     if urls:
         count = 0
         with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
@@ -497,8 +497,8 @@ def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iter
             count += 1
             yield f'data: {json.dumps({"action": "download", "count": count})}\n\n'
 
-async def async_read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> AsyncIterator[str]:
-    urls = get_downloads_urls(bucket_dir)
+async def async_read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> AsyncIterator[str]:
+    urls = get_downloads_urls(bucket_dir, delete_files)
     if urls:
         count = 0
         with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
@@ -513,7 +513,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
     if refine_chunks_with_spacy:
         for chunk in stream_read_parts_and_refine(bucket_dir, delete_files):
             if event_stream:
-                size += len(chunk)
+                size += len(chunk.decode('utf-8'))
                 yield f'data: {json.dumps({"action": "refine", "size": size})}\n\n'
             else:
                 yield chunk
@@ -522,7 +522,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
         streaming = cache_stream(streaming, bucket_dir)
         for chunk in streaming:
             if event_stream:
-                size += len(chunk)
+                size += len(chunk.decode('utf-8'))
                 yield f'data: {json.dumps({"action": "load", "size": size})}\n\n'
             else:
                 yield chunk
@@ -541,7 +541,7 @@ def get_streaming(bucket_dir: str, delete_files = False, refine_chunks_with_spac
     bucket_dir = Path(bucket_dir)
     bucket_dir.mkdir(parents=True, exist_ok=True)
     try:
-        yield from read_and_download_urls(bucket_dir, event_stream)
+        yield from read_and_download_urls(bucket_dir, delete_files, event_stream)
         yield from stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream)
     except Exception as e:
         if event_stream:
@@ -552,7 +552,7 @@ async def get_async_streaming(bucket_dir: str, delete_files = False, refine_chun
     bucket_dir = Path(bucket_dir)
     bucket_dir.mkdir(parents=True, exist_ok=True)
     try:
-        async for chunk in async_read_and_download_urls(bucket_dir, event_stream):
+        async for chunk in async_read_and_download_urls(bucket_dir, delete_files, event_stream):
             yield chunk
         for chunk in stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream):
             yield chunk
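
For context on the event_stream branches touched above: each yielded event is a Server-Sent-Events style "data: <json>" line terminated by a blank line. A small illustration of that wire format; the helper name is invented and not part of the repository:

import json

# Mirrors the f-string pattern in the diff: one "data: <json>" line per
# event, followed by an empty line so SSE clients treat it as one message.
def sse_event(action: str, **fields) -> str:
    return f'data: {json.dumps({"action": action, **fields})}\n\n'

print(sse_event("download", count=3), end="")
# data: {"action": "download", "count": 3}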

@@ -15,4 +15,5 @@ beautifulsoup4
 aiohttp_socks
 cryptography
 python-multipart
 pypdf2
+python-docx

@@ -19,4 +19,4 @@ cryptography
 nodriver
 python-multipart
 pypdf2
-docx
+python-docx

@@ -40,7 +40,7 @@ EXTRA_REQUIRE = {
         "plyer",
         "setuptools",
         "pypdf2", # files
-        "docx",
+        "python-docx",
         "odfpy",
         "ebooklib",
         "openpyxl",
@@ -58,6 +58,7 @@ EXTRA_REQUIRE = {
         "uvicorn", # api
         "python-multipart",
         "pypdf2", # files
+        "python-docx",
     ],
     "image": [
         "pillow",
@@ -92,7 +93,7 @@ EXTRA_REQUIRE = {
         "spacy",
         "beautifulsoup4",
         "pypdf2",
-        "docx",
+        "python-docx",
         "odfpy",
         "ebooklib",
         "openpyxl",