from typing import BinaryIO, Any
import asyncio
from markitdown._base_converter import DocumentConverter, DocumentConverterResult
from markitdown._stream_info import StreamInfo
from markitdown.converters._llm_caption import llm_caption
from markitdown.converters._exiftool import exiftool_metadata
from ._base_converter import AsyncDocumentConverterResult

ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/jpeg",
    "image/png",
]

ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]

# Metadata keys copied into the markdown output, in the order they are emitted.
_METADATA_FIELDS = (
    "ImageSize",
    "Title",
    "Caption",
    "Description",
    "Keywords",
    "Artist",
    "Author",
    "DateTimeOriginal",
    "CreateDate",
    "GPSPosition",
)


class ImageConverter(DocumentConverter):
    """
    Converts JPEG/PNG images to markdown.

    The output is made of two optional parts: a list of selected metadata
    fields obtained through `exiftool_metadata`, and an image description
    obtained through `llm_caption` when both an `llm_client` and an
    `llm_model` are supplied.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """
        Report whether this converter handles the given stream.

        The stream is accepted when its (lower-cased) extension is listed in
        ACCEPTED_FILE_EXTENSIONS, or when its (lower-cased) mimetype starts
        with one of ACCEPTED_MIME_TYPE_PREFIXES. A missing extension or
        mimetype is treated as an empty string.
        """
        mime = (stream_info.mimetype or "").lower()
        ext = (stream_info.extension or "").lower()

        if ext in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mime.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Build the markdown representation of an image.

        Recognised keyword options:
            exiftool_path: forwarded to `exiftool_metadata`.
            llm_client, llm_model: when both are not None, `llm_caption` is
                called to describe the image.
            llm_prompt: forwarded to `llm_caption` as `prompt`.

        Returns a DocumentConverterResult holding the metadata lines followed
        by the description section, or an AsyncDocumentConverterResult
        wrapping the pending caption when `llm_caption` returns a coroutine.
        """
        # Metadata section: one "<field>: <value>" line per field that is present.
        exif = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        metadata_lines = []
        if exif:
            metadata_lines = [
                f"{name}: {exif[name]}\n" for name in _METADATA_FIELDS if name in exif
            ]
        markdown = "".join(metadata_lines)

        # Description section: only attempted when a client and a model are given.
        client = kwargs.get("llm_client")
        model = kwargs.get("llm_model")
        if client is None or model is None:
            return DocumentConverterResult(
                markdown=markdown,
            )

        caption = llm_caption(
            file_stream,
            stream_info,
            client=client,
            model=model,
            prompt=kwargs.get("llm_prompt"),
        )

        if asyncio.iscoroutine(caption):
            # NOTE(review): the metadata text gathered above is not handed to
            # the async result; confirm that AsyncDocumentConverterResult is
            # expected to carry only the caption.
            return AsyncDocumentConverterResult(
                caption,
            )

        if caption is not None:
            markdown += "\n# Description:\n" + caption.strip() + "\n"

        return DocumentConverterResult(
            markdown=markdown,
        )