Merge branch 'master' into gradient-clipping

2025-12-24 19:01:06 -08:00 · 2023-01-04 19:56:35 +03:00 · 2023-01-04 19:56:35 +03:00 · eeb1de4388
commit eeb1de4388
parent d85c2cb2d5 b7deea47ee
111 changed files with 7614 additions and 2403 deletions
--- a/modules/api/api.py
+++ b/modules/api/api.py
@ -1,18 +1,27 @@
 import base64
 import io
 import time
+import datetime
 import uvicorn
 from threading import Lock
-from gradio.processing_utils import encode_pil_to_base64, decode_base64_to_file, decode_base64_to_image
-from fastapi import APIRouter, Depends, FastAPI, HTTPException
+from io import BytesIO
+from gradio.processing_utils import decode_base64_to_file
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Response
+from fastapi.security import HTTPBasic, HTTPBasicCredentials
+from secrets import compare_digest
+
 import modules.shared as shared
+from modules import sd_samplers, deepbooru, sd_hijack
 from modules.api.models import *
 from modules.processing import StableDiffusionProcessingTxt2Img, StableDiffusionProcessingImg2Img, process_images
-from modules.sd_samplers import all_samplers
 from modules.extras import run_extras, run_pnginfo
-from PIL import PngImagePlugin
-from modules.sd_models import checkpoints_list
+from modules.textual_inversion.textual_inversion import create_embedding, train_embedding
+from modules.textual_inversion.preprocess import preprocess
+from modules.hypernetworks.hypernetwork import create_hypernetwork, train_hypernetwork
+from PIL import PngImagePlugin,Image
+from modules.sd_models import checkpoints_list, find_checkpoint_config
 from modules.realesrgan_model import get_realesrgan_models
+from modules import devices
 from typing import List

 def upscaler_to_index(name: str):
@ -22,8 +31,12 @@ def upscaler_to_index(name: str):
        raise HTTPException(status_code=400, detail=f"Invalid upscaler, needs to be on of these: {' , '.join([x.name for x in sd_upscalers])}")


-sampler_to_index = lambda name: next(filter(lambda row: name.lower() == row[1].name.lower(), enumerate(all_samplers)), None)
+def validate_sampler_name(name):
+    """Return ``name`` unchanged when it is a key of ``sd_samplers.all_samplers_map``.
+
+    Raises:
+        HTTPException: 404 "Sampler not found" when the map has no entry for ``name``.
+    """
+    if sd_samplers.all_samplers_map.get(name) is None:
+        raise HTTPException(status_code=404, detail="Sampler not found")

+    return name

 def setUpscalers(req: dict):
    reqDict = vars(req)
@ -33,6 +46,10 @@ def setUpscalers(req: dict):
    reqDict.pop('upscaler_2')
    return reqDict

+def decode_base64_to_image(encoding):
+    """Decode a base64 string (optionally a ``data:image/...`` URI) into a PIL image.
+
+    Args:
+        encoding: base64 text, either bare or prefixed with a
+            ``data:image/<type>;base64,`` header.
+
+    Returns:
+        The image opened with ``Image.open``.
+
+    Raises:
+        HTTPException: 500 "Invalid encoded image" when the header is malformed,
+            the payload is not valid base64, or it cannot be opened as an image.
+    """
+    try:
+        if encoding.startswith("data:image/"):
+            # keep only the payload that follows the "<mime>;base64," header
+            encoding = encoding.split(";")[1].split(",")[1]
+        return Image.open(BytesIO(base64.b64decode(encoding)))
+    except Exception as e:
+        # surface bad client input as an HTTP error instead of a raw traceback
+        raise HTTPException(status_code=500, detail="Invalid encoded image") from e

 def encode_pil_to_base64(image):
    with io.BytesIO() as output_bytes:
@ -51,67 +68,104 @@ def encode_pil_to_base64(image):
        bytes_data = output_bytes.getvalue()
    return base64.b64encode(bytes_data)

+def api_middleware(app: FastAPI):
+    """Install an HTTP middleware on ``app`` that times every request.
+
+    The elapsed wall-clock time (seconds, rounded to 4 places) is returned in the
+    ``X-Process-Time`` response header. When ``shared.cmd_opts.api_log`` is set,
+    requests whose path starts with ``/sdapi`` are also printed to stdout.
+    """
+    @app.middleware("http")
+    async def log_and_time(req: Request, call_next):
+        ts = time.time()
+        res: Response = await call_next(req)
+        duration = str(round(time.time() - ts, 4))
+        res.headers["X-Process-Time"] = duration
+        endpoint = req.scope.get('path', 'err')
+        if shared.cmd_opts.api_log and endpoint.startswith('/sdapi'):
+            # The ASGI scope may carry "client": None, in which case .get()'s
+            # default is not used; fall back explicitly so indexing cannot fail.
+            client = req.scope.get('client') or ('0:0.0.0', 0)
+            print('API {t} {code} {prot}/{ver} {method} {endpoint} {cli} {duration}'.format(
+                t = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
+                code = res.status_code,
+                ver = req.scope.get('http_version', '0.0'),
+                cli = client[0],
+                prot = req.scope.get('scheme', 'err'),
+                method = req.scope.get('method', 'err'),
+                endpoint = endpoint,
+                duration = duration,
+            ))
+        return res
+

 class Api:
    def __init__(self, app: FastAPI, queue_lock: Lock):
+        if shared.cmd_opts.api_auth:
+            self.credentials = dict()
+            for auth in shared.cmd_opts.api_auth.split(","):
+                user, password = auth.split(":")
+                self.credentials[user] = password
+
        self.router = APIRouter()
        self.app = app
        self.queue_lock = queue_lock
-        self.app.add_api_route("/sdapi/v1/txt2img", self.text2imgapi, methods=["POST"], response_model=TextToImageResponse)
-        self.app.add_api_route("/sdapi/v1/img2img", self.img2imgapi, methods=["POST"], response_model=ImageToImageResponse)
-        self.app.add_api_route("/sdapi/v1/extra-single-image", self.extras_single_image_api, methods=["POST"], response_model=ExtrasSingleImageResponse)
-        self.app.add_api_route("/sdapi/v1/extra-batch-images", self.extras_batch_images_api, methods=["POST"], response_model=ExtrasBatchImagesResponse)
-        self.app.add_api_route("/sdapi/v1/png-info", self.pnginfoapi, methods=["POST"], response_model=PNGInfoResponse)
-        self.app.add_api_route("/sdapi/v1/progress", self.progressapi, methods=["GET"], response_model=ProgressResponse)
-        self.app.add_api_route("/sdapi/v1/interrogate", self.interrogateapi, methods=["POST"])
-        self.app.add_api_route("/sdapi/v1/interrupt", self.interruptapi, methods=["POST"])
-        self.app.add_api_route("/sdapi/v1/options", self.get_config, methods=["GET"], response_model=OptionsModel)
-        self.app.add_api_route("/sdapi/v1/options", self.set_config, methods=["POST"])
-        self.app.add_api_route("/sdapi/v1/cmd-flags", self.get_cmd_flags, methods=["GET"], response_model=FlagsModel)
-        self.app.add_api_route("/sdapi/v1/samplers", self.get_samplers, methods=["GET"], response_model=List[SamplerItem])
-        self.app.add_api_route("/sdapi/v1/upscalers", self.get_upscalers, methods=["GET"], response_model=List[UpscalerItem])
-        self.app.add_api_route("/sdapi/v1/sd-models", self.get_sd_models, methods=["GET"], response_model=List[SDModelItem])
-        self.app.add_api_route("/sdapi/v1/hypernetworks", self.get_hypernetworks, methods=["GET"], response_model=List[HypernetworkItem])
-        self.app.add_api_route("/sdapi/v1/face-restorers", self.get_face_restorers, methods=["GET"], response_model=List[FaceRestorerItem])
-        self.app.add_api_route("/sdapi/v1/realesrgan-models", self.get_realesrgan_models, methods=["GET"], response_model=List[RealesrganItem])
-        self.app.add_api_route("/sdapi/v1/prompt-styles", self.get_promp_styles, methods=["GET"], response_model=List[PromptStyleItem])
-        self.app.add_api_route("/sdapi/v1/artist-categories", self.get_artists_categories, methods=["GET"], response_model=List[str])
-        self.app.add_api_route("/sdapi/v1/artists", self.get_artists, methods=["GET"], response_model=List[ArtistItem])
+        api_middleware(self.app)
+        self.add_api_route("/sdapi/v1/txt2img", self.text2imgapi, methods=["POST"], response_model=TextToImageResponse)
+        self.add_api_route("/sdapi/v1/img2img", self.img2imgapi, methods=["POST"], response_model=ImageToImageResponse)
+        self.add_api_route("/sdapi/v1/extra-single-image", self.extras_single_image_api, methods=["POST"], response_model=ExtrasSingleImageResponse)
+        self.add_api_route("/sdapi/v1/extra-batch-images", self.extras_batch_images_api, methods=["POST"], response_model=ExtrasBatchImagesResponse)
+        self.add_api_route("/sdapi/v1/png-info", self.pnginfoapi, methods=["POST"], response_model=PNGInfoResponse)
+        self.add_api_route("/sdapi/v1/progress", self.progressapi, methods=["GET"], response_model=ProgressResponse)
+        self.add_api_route("/sdapi/v1/interrogate", self.interrogateapi, methods=["POST"])
+        self.add_api_route("/sdapi/v1/interrupt", self.interruptapi, methods=["POST"])
+        self.add_api_route("/sdapi/v1/skip", self.skip, methods=["POST"])
+        self.add_api_route("/sdapi/v1/options", self.get_config, methods=["GET"], response_model=OptionsModel)
+        self.add_api_route("/sdapi/v1/options", self.set_config, methods=["POST"])
+        self.add_api_route("/sdapi/v1/cmd-flags", self.get_cmd_flags, methods=["GET"], response_model=FlagsModel)
+        self.add_api_route("/sdapi/v1/samplers", self.get_samplers, methods=["GET"], response_model=List[SamplerItem])
+        self.add_api_route("/sdapi/v1/upscalers", self.get_upscalers, methods=["GET"], response_model=List[UpscalerItem])
+        self.add_api_route("/sdapi/v1/sd-models", self.get_sd_models, methods=["GET"], response_model=List[SDModelItem])
+        self.add_api_route("/sdapi/v1/hypernetworks", self.get_hypernetworks, methods=["GET"], response_model=List[HypernetworkItem])
+        self.add_api_route("/sdapi/v1/face-restorers", self.get_face_restorers, methods=["GET"], response_model=List[FaceRestorerItem])
+        self.add_api_route("/sdapi/v1/realesrgan-models", self.get_realesrgan_models, methods=["GET"], response_model=List[RealesrganItem])
+        self.add_api_route("/sdapi/v1/prompt-styles", self.get_prompt_styles, methods=["GET"], response_model=List[PromptStyleItem])
+        self.add_api_route("/sdapi/v1/artist-categories", self.get_artists_categories, methods=["GET"], response_model=List[str])
+        self.add_api_route("/sdapi/v1/artists", self.get_artists, methods=["GET"], response_model=List[ArtistItem])
+        self.add_api_route("/sdapi/v1/embeddings", self.get_embeddings, methods=["GET"], response_model=EmbeddingsResponse)
+        self.add_api_route("/sdapi/v1/refresh-checkpoints", self.refresh_checkpoints, methods=["POST"])
+        self.add_api_route("/sdapi/v1/create/embedding", self.create_embedding, methods=["POST"], response_model=CreateResponse)
+        self.add_api_route("/sdapi/v1/create/hypernetwork", self.create_hypernetwork, methods=["POST"], response_model=CreateResponse)
+        self.add_api_route("/sdapi/v1/preprocess", self.preprocess, methods=["POST"], response_model=PreprocessResponse)
+        self.add_api_route("/sdapi/v1/train/embedding", self.train_embedding, methods=["POST"], response_model=TrainResponse)
+        self.add_api_route("/sdapi/v1/train/hypernetwork", self.train_hypernetwork, methods=["POST"], response_model=TrainResponse)
+
+    def add_api_route(self, path: str, endpoint, **kwargs):
+        """Register ``endpoint`` at ``path`` on the wrapped FastAPI app.
+
+        When ``shared.cmd_opts.api_auth`` is set, the route additionally depends
+        on ``self.auth``; otherwise it is registered as given.
+        """
+        if not shared.cmd_opts.api_auth:
+            return self.app.add_api_route(path, endpoint, **kwargs)
+        return self.app.add_api_route(path, endpoint, dependencies=[Depends(self.auth)], **kwargs)
+
+    def auth(self, credentials: HTTPBasicCredentials = Depends(HTTPBasic())):
+        """Check HTTP Basic credentials against ``self.credentials``.
+
+        Returns:
+            True when the username is known and the password matches.
+
+        Raises:
+            HTTPException: 401 with a ``WWW-Authenticate: Basic`` header otherwise.
+        """
+        expected = self.credentials.get(credentials.username)
+        if expected is not None:
+            # compare_digest() raises TypeError for non-ASCII str arguments, so
+            # compare the UTF-8 bytes; a non-ASCII password then yields 401, not 500.
+            if compare_digest(credentials.password.encode("utf-8"), expected.encode("utf-8")):
+                return True
+
+        raise HTTPException(status_code=401, detail="Incorrect username or password", headers={"WWW-Authenticate": "Basic"})

    def text2imgapi(self, txt2imgreq: StableDiffusionTxt2ImgProcessingAPI):
-        sampler_index = sampler_to_index(txt2imgreq.sampler_index)
-
-        if sampler_index is None:
-            raise HTTPException(status_code=404, detail="Sampler not found")
-
        populate = txt2imgreq.copy(update={ # Override __init__ params
-            "sd_model": shared.sd_model,
-            "sampler_index": sampler_index[0],
+            "sampler_name": validate_sampler_name(txt2imgreq.sampler_name or txt2imgreq.sampler_index),
            "do_not_save_samples": True,
            "do_not_save_grid": True
            }
        )
-        p = StableDiffusionProcessingTxt2Img(**vars(populate))
-        # Override object param
-
-        shared.state.begin()
+        if populate.sampler_name:
+            populate.sampler_index = None  # prevent a warning later on

        with self.queue_lock:
-            processed = process_images(p)
+            p = StableDiffusionProcessingTxt2Img(sd_model=shared.sd_model, **vars(populate))
+
+            shared.state.begin()
+            processed = process_images(p)
+            shared.state.end()

-        shared.state.end()

        b64images = list(map(encode_pil_to_base64, processed.images))

        return TextToImageResponse(images=b64images, parameters=vars(txt2imgreq), info=processed.js())

    def img2imgapi(self, img2imgreq: StableDiffusionImg2ImgProcessingAPI):
-        sampler_index = sampler_to_index(img2imgreq.sampler_index)
-
-        if sampler_index is None:
-            raise HTTPException(status_code=404, detail="Sampler not found")
-
-
        init_images = img2imgreq.init_images
        if init_images is None:
            raise HTTPException(status_code=404, detail="Init image not found")
@ -120,34 +174,30 @@ class Api:
        if mask:
            mask = decode_base64_to_image(mask)

-
        populate = img2imgreq.copy(update={ # Override __init__ params
-            "sd_model": shared.sd_model,
-            "sampler_index": sampler_index[0],
+            "sampler_name": validate_sampler_name(img2imgreq.sampler_name or img2imgreq.sampler_index),
            "do_not_save_samples": True,
            "do_not_save_grid": True,
            "mask": mask
            }
        )
-        p = StableDiffusionProcessingImg2Img(**vars(populate))
+        if populate.sampler_name:
+            populate.sampler_index = None  # prevent a warning later on

-        imgs = []
-        for img in init_images:
-            img = decode_base64_to_image(img)
-            imgs = [img] * p.batch_size
-
-        p.init_images = imgs
-
-        shared.state.begin()
+        args = vars(populate)
+        args.pop('include_init_images', None)  # this is meant to be done by "exclude": True in model, but it's for a reason that I cannot determine.

        with self.queue_lock:
-            processed = process_images(p)
+            p = StableDiffusionProcessingImg2Img(sd_model=shared.sd_model, **args)
+            p.init_images = [decode_base64_to_image(x) for x in init_images]

-        shared.state.end()
+            shared.state.begin()
+            processed = process_images(p)
+            shared.state.end()

        b64images = list(map(encode_pil_to_base64, processed.images))

-        if (not img2imgreq.include_init_images):
+        if not img2imgreq.include_init_images:
            img2imgreq.init_images = None
            img2imgreq.mask = None

@ -159,7 +209,7 @@ class Api:
        reqDict['image'] = decode_base64_to_image(reqDict['image'])

        with self.queue_lock:
-            result = run_extras(extras_mode=0, image_folder="", input_dir="", output_dir="", **reqDict)
+            result = run_extras(extras_mode=0, image_folder="", input_dir="", output_dir="", save_output=False, **reqDict)

        return ExtrasSingleImageResponse(image=encode_pil_to_base64(result[0][0]), html_info=result[1])

@ -175,7 +225,7 @@ class Api:
        reqDict.pop('imageList')

        with self.queue_lock:
-            result = run_extras(extras_mode=1, image="", input_dir="", output_dir="", **reqDict)
+            result = run_extras(extras_mode=1, image="", input_dir="", output_dir="", save_output=False, **reqDict)

        return ExtrasBatchImagesResponse(images=list(map(encode_pil_to_base64, result[0])), html_info=result[1])

@ -218,14 +268,20 @@ class Api:
    def interrogateapi(self, interrogatereq: InterrogateRequest):
        image_b64 = interrogatereq.image
        if image_b64 is None:
-            raise HTTPException(status_code=404, detail="Image not found") 
+            raise HTTPException(status_code=404, detail="Image not found")

-        img = self.__base64_to_image(image_b64)
+        img = decode_base64_to_image(image_b64)
+        img = img.convert('RGB')

        # Override object param
        with self.queue_lock:
-            processed = shared.interrogator.interrogate(img)
-        
+            if interrogatereq.model == "clip":
+                processed = shared.interrogator.interrogate(img)
+            elif interrogatereq.model == "deepdanbooru":
+                processed = deepbooru.model.tag(img)
+            else:
+                raise HTTPException(status_code=404, detail="Model not found")
+
        return InterrogateResponse(caption=processed)

    def interruptapi(self):
@ -233,6 +289,9 @@ class Api:

        return {}

+    def skip(self):
+        """Handler for POST /sdapi/v1/skip: calls ``shared.state.skip()`` and returns nothing."""
+        shared.state.skip()
+
    def get_config(self):
        options = {}
        for key in shared.opts.data.keys():
@ -244,14 +303,9 @@ class Api:

        return options

-    def set_config(self, req: OptionsModel):
-        # currently req has all options fields even if you send a dict like { "send_seed": false }, which means it will
-        # overwrite all options with default values.
-        raise RuntimeError('Setting options via API is not supported')
-
-        reqDict = vars(req)
-        for o in reqDict:
-            setattr(shared.opts, o, reqDict[o])
+    def set_config(self, req: Dict[str, Any]):
+        for k, v in req.items():
+            shared.opts.set(k, v)

        shared.opts.save(shared.config_filename)
        return
@ -260,7 +314,7 @@ class Api:
        return vars(shared.cmd_opts)

    def get_samplers(self):
-        return [{"name":sampler[0], "aliases":sampler[2], "options":sampler[3]} for sampler in all_samplers]
+        return [{"name": sampler[0], "aliases":sampler[2], "options":sampler[3]} for sampler in sd_samplers.all_samplers]

    def get_upscalers(self):
        upscalers = []
@ -272,7 +326,7 @@ class Api:
        return upscalers

    def get_sd_models(self):
-        return [{"title":x.title, "model_name":x.model_name, "hash":x.hash, "filename": x.filename, "config": x.config} for x in checkpoints_list.values()]
+        return [{"title":x.title, "model_name":x.model_name, "hash":x.hash, "filename": x.filename, "config": find_checkpoint_config(x)} for x in checkpoints_list.values()]

    def get_hypernetworks(self):
        return [{"name": name, "path": shared.hypernetworks[name]} for name in shared.hypernetworks]
@ -283,11 +337,11 @@ class Api:
    def get_realesrgan_models(self):
        return [{"name":x.name,"path":x.data_path, "scale":x.scale} for x in get_realesrgan_models(None)]

-    def get_promp_styles(self):
+    def get_prompt_styles(self):
        styleList = []
        for k in shared.prompt_styles.styles:
            style = shared.prompt_styles.styles[k]
-            styleList.append({"name":style[0], "prompt": style[1], "negative_prompr": style[2]})
+            styleList.append({"name":style[0], "prompt": style[1], "negative_prompt": style[2]})

        return styleList

@ -297,6 +351,112 @@ class Api:
    def get_artists(self):
        return [{"name":x[0], "score":x[1], "category":x[2]} for x in shared.artist_db.artists]

+    def get_embeddings(self):
+        """Describe the embeddings known to ``sd_hijack.model_hijack.embedding_db``.
+
+        Returns a dict with two keys, "loaded" and "skipped", each mapping an
+        embedding's name to its step, checkpoint hash/name, shape and vector count.
+        """
+        def describe(emb):
+            # one entry of the response, built from the embedding's attributes
+            return dict(
+                step=emb.step,
+                sd_checkpoint=emb.sd_checkpoint,
+                sd_checkpoint_name=emb.sd_checkpoint_name,
+                shape=emb.shape,
+                vectors=emb.vectors,
+            )
+
+        def summarize(embeddings):
+            summary = {}
+            for emb in embeddings.values():
+                summary[emb.name] = describe(emb)
+            return summary
+
+        db = sd_hijack.model_hijack.embedding_db
+        loaded = summarize(db.word_embeddings)
+        skipped = summarize(db.skipped_embeddings)
+        return {"loaded": loaded, "skipped": skipped}
+
+    def refresh_checkpoints(self):
+        """Handler for POST /sdapi/v1/refresh-checkpoints: delegates to ``shared.refresh_checkpoints()``."""
+        shared.refresh_checkpoints()
+
+    def create_embedding(self, args: dict):
+        """Create an empty embedding via ``create_embedding(**args)`` and reload the embedding db.
+
+        Returns:
+            CreateResponse whose ``info`` holds either the created filename or,
+            when an AssertionError is raised, the error text.
+        """
+        shared.state.begin()
+        try:
+            filename = create_embedding(**args) # create empty embedding
+            sd_hijack.model_hijack.embedding_db.load_textual_inversion_embeddings() # reload embeddings so new one can be immediately used
+            return CreateResponse(info = "create embedding filename: {filename}".format(filename = filename))
+        except AssertionError as e:
+            # the route is declared with response_model=CreateResponse
+            return CreateResponse(info = "create embedding error: {error}".format(error = e))
+        finally:
+            # always close the job, including when an unexpected exception propagates
+            shared.state.end()
+
+    def create_hypernetwork(self, args: dict):
+        """Create an empty hypernetwork via ``create_hypernetwork(**args)``.
+
+        Returns:
+            CreateResponse whose ``info`` holds either the created filename or,
+            when an AssertionError is raised, the error text.
+        """
+        shared.state.begin()
+        try:
+            filename = create_hypernetwork(**args) # create empty hypernetwork
+            return CreateResponse(info = "create hypernetwork filename: {filename}".format(filename = filename))
+        except AssertionError as e:
+            # the route is declared with response_model=CreateResponse
+            return CreateResponse(info = "create hypernetwork error: {error}".format(error = e))
+        finally:
+            # always close the job, including when an unexpected exception propagates
+            shared.state.end()
+
+    def preprocess(self, args: dict):
+        """Run ``preprocess(**args)`` and report the outcome as text.
+
+        Returns:
+            PreprocessResponse with ``info`` set to 'preprocess complete', or to an
+            error description when KeyError, AssertionError or FileNotFoundError
+            is raised. Other exceptions propagate.
+        """
+        shared.state.begin()
+        try:
+            preprocess(**args) # quick operation unless blip/booru interrogation is enabled
+            return PreprocessResponse(info = 'preprocess complete')
+        except KeyError as e:
+            return PreprocessResponse(info = "preprocess error: invalid token: {error}".format(error = e))
+        except (AssertionError, FileNotFoundError) as e:
+            return PreprocessResponse(info = "preprocess error: {error}".format(error = e))
+        finally:
+            # always close the job, including when an unexpected exception propagates
+            shared.state.end()
+
+    def train_embedding(self, args: dict):
+        """Train an embedding with ``train_embedding(**args)`` and report the outcome.
+
+        When ``shared.opts.training_xattention_optimizations`` is off, the
+        sd_hijack optimizations are undone for the duration of training and
+        re-applied afterwards. Exceptions raised by training itself are reported
+        in the response text rather than propagated.
+
+        Returns:
+            TrainResponse whose ``info`` carries the filename and any error.
+        """
+        shared.state.begin()
+        try:
+            apply_optimizations = shared.opts.training_xattention_optimizations
+            error = None
+            filename = ''
+            if not apply_optimizations:
+                sd_hijack.undo_optimizations()
+            try:
+                _, filename = train_embedding(**args) # can take a long time to complete
+            except Exception as e:
+                error = e
+            finally:
+                if not apply_optimizations:
+                    sd_hijack.apply_optimizations()
+            return TrainResponse(info = "train embedding complete: filename: {filename} error: {error}".format(filename = filename, error = error))
+        except AssertionError as msg:
+            return TrainResponse(info = "train embedding error: {msg}".format(msg = msg))
+        finally:
+            # single exit point: the job is closed exactly once on every path
+            shared.state.end()
+
+    def train_hypernetwork(self, args: dict):
+        """Train a hypernetwork with ``train_hypernetwork(**args)`` and report the outcome.
+
+        After training (successful or not) the previously loaded hypernetwork is
+        restored and the model's cond/first stage sub-models are moved back to
+        ``devices.device``. Exceptions raised by training itself are reported in
+        the response text rather than propagated.
+
+        Returns:
+            TrainResponse whose ``info`` carries the filename and any error.
+        """
+        shared.state.begin()
+        try:
+            initial_hypernetwork = shared.loaded_hypernetwork
+            apply_optimizations = shared.opts.training_xattention_optimizations
+            error = None
+            filename = ''
+            if not apply_optimizations:
+                sd_hijack.undo_optimizations()
+            try:
+                # args is a dict of keyword arguments: expand with **, not *
+                # (a single * would pass the dict's keys positionally)
+                _, filename = train_hypernetwork(**args)
+            except Exception as e:
+                error = e
+            finally:
+                shared.loaded_hypernetwork = initial_hypernetwork
+                shared.sd_model.cond_stage_model.to(devices.device)
+                shared.sd_model.first_stage_model.to(devices.device)
+                if not apply_optimizations:
+                    sd_hijack.apply_optimizations()
+            # NOTE: the wording "train embedding" is kept as-is for API compatibility
+            return TrainResponse(info = "train embedding complete: filename: {filename} error: {error}".format(filename = filename, error = error))
+        except AssertionError as msg:
+            # report the caught assertion itself, not the inner `error` variable
+            return TrainResponse(info = "train embedding error: {error}".format(error = msg))
+        finally:
+            # single exit point: the job is closed exactly once on every path
+            shared.state.end()
+
    def launch(self, server_name, port):
        self.app.include_router(self.router)
        uvicorn.run(self.app, host=server_name, port=port)
--- a/modules/api/models.py
+++ b/modules/api/models.py
@ -128,7 +128,7 @@ class ExtrasBaseRequest(BaseModel):
    upscaling_resize: float = Field(default=2, title="Upscaling Factor", ge=1, le=4, description="By how much to upscale the image, only used when resize_mode=0.")
    upscaling_resize_w: int = Field(default=512, title="Target Width", ge=1, description="Target width for the upscaler to hit. Only used when resize_mode=1.")
    upscaling_resize_h: int = Field(default=512, title="Target Height", ge=1, description="Target height for the upscaler to hit. Only used when resize_mode=1.")
-    upscaling_crop: bool = Field(default=True, title="Crop to fit", description="Should the upscaler crop the image to fit in the choosen size?")
+    upscaling_crop: bool = Field(default=True, title="Crop to fit", description="Should the upscaler crop the image to fit in the chosen size?")
    upscaler_1: str = Field(default="None", title="Main upscaler", description=f"The name of the main upscaler to use, it has to be one of this list: {' , '.join([x.name for x in sd_upscalers])}")
    upscaler_2: str = Field(default="None", title="Secondary upscaler", description=f"The name of the secondary upscaler to use, it has to be one of this list: {' , '.join([x.name for x in sd_upscalers])}")
    extras_upscaler_2_visibility: float = Field(default=0, title="Secondary upscaler visibility", ge=0, le=1, allow_inf_nan=False, description="Sets the visibility of secondary upscaler, values should be between 0 and 1.")
@ -170,14 +170,24 @@ class ProgressResponse(BaseModel):

 class InterrogateRequest(BaseModel):
    image: str = Field(default="", title="Image", description="Image to work on, must be a Base64 string containing the image's data.")
+    model: str = Field(default="clip", title="Model", description="The interrogate model used.")

 class InterrogateResponse(BaseModel):
    caption: str = Field(default=None, title="Caption", description="The generated caption for the image.")

+# Response body with a single required text field, `info`.
+# (A `#` comment is used instead of a docstring so the generated schema is unchanged.)
+class TrainResponse(BaseModel):
+    info: str = Field(title="Train info", description="Response string from train embedding or hypernetwork task.")
+
+# Response body with a single required text field, `info`.
+# (A `#` comment is used instead of a docstring so the generated schema is unchanged.)
+class CreateResponse(BaseModel):
+    info: str = Field(title="Create info", description="Response string from create embedding or hypernetwork task.")
+
+# Response body with a single required text field, `info`.
+# (A `#` comment is used instead of a docstring so the generated schema is unchanged.)
+class PreprocessResponse(BaseModel):
+    info: str = Field(title="Preprocess info", description="Response string from preprocessing task.")
+
 fields = {}
-for key, value in opts.data.items():
-    metadata = opts.data_labels.get(key)
-    optType = opts.typemap.get(type(value), type(value))
+for key, metadata in opts.data_labels.items():
+    value = opts.data.get(key)
+    optType = opts.typemap.get(type(metadata.default), type(value))

    if (metadata is not None):
        fields.update({key: (Optional[optType], Field(
@ -239,3 +249,13 @@ class ArtistItem(BaseModel):
    score: float = Field(title="Score")
    category: str = Field(title="Category")

+# Metadata for one embedding: three optional provenance fields plus the
+# required `shape` and `vectors` integers.
+class EmbeddingItem(BaseModel):
+    step: Optional[int] = Field(title="Step", description="The number of steps that were used to train this embedding, if available")
+    sd_checkpoint: Optional[str] = Field(title="SD Checkpoint", description="The hash of the checkpoint this embedding was trained on, if available")
+    sd_checkpoint_name: Optional[str] = Field(title="SD Checkpoint Name", description="The name of the checkpoint this embedding was trained on, if available. Note that this is the name that was used by the trainer; for a stable identifier, use `sd_checkpoint` instead")
+    shape: int = Field(title="Shape", description="The length of each individual vector in the embedding")
+    vectors: int = Field(title="Vectors", description="The number of vectors in the embedding")
+
+# Two name -> EmbeddingItem maps: embeddings that were loaded and embeddings that were skipped.
+class EmbeddingsResponse(BaseModel):
+    loaded: Dict[str, EmbeddingItem] = Field(title="Loaded", description="Embeddings loaded for the current model")
+    skipped: Dict[str, EmbeddingItem] = Field(title="Skipped", description="Embeddings skipped for the current model (likely due to architecture incompatibility)")
--- a/modules/call_queue.py
+++ b/modules/call_queue.py
@ -0,0 +1,98 @@
+import html
+import sys
+import threading
+import traceback
+import time
+
+from modules import shared
+
+queue_lock = threading.Lock()
+
+
+def wrap_queued_call(func):
+    """Return a wrapper that calls ``func`` while holding the module-level ``queue_lock``."""
+    def f(*args, **kwargs):
+        with queue_lock:
+            return func(*args, **kwargs)
+
+    return f
+
+
+def wrap_gradio_gpu_call(func, extra_outputs=None):
+    """Wrap ``func`` so it runs under ``queue_lock`` between ``shared.state.begin()`` and ``end()``.
+
+    The locked wrapper is then passed through ``wrap_gradio_call`` with
+    ``add_stats=True`` and the given ``extra_outputs``.
+    """
+    def f(*args, **kwargs):
+        shared.state.begin()
+        with queue_lock:
+            result = func(*args, **kwargs)
+        shared.state.end()
+        return result
+
+    wrapped = wrap_gradio_call(f, extra_outputs=extra_outputs, add_stats=True)
+    return wrapped
+
+
+def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
+    """Wrap ``func`` with error reporting and optional timing/memory statistics.
+
+    The wrapper converts ``func``'s result to a list and returns it as a tuple.
+    If ``func`` raises, the arguments and traceback are printed to stderr and the
+    result becomes ``extra_outputs`` (default ``[None, '']``) followed by an HTML
+    error block. With ``add_stats`` set, a "performance" HTML block (elapsed time
+    and, when the memory monitor ran, VRAM figures) is appended to the last
+    element of the result, which is therefore expected to be an HTML string.
+    """
+    def f(*args, extra_outputs_array=extra_outputs, **kwargs):
+        run_memmon = shared.opts.memmon_poll_rate > 0 and not shared.mem_mon.disabled and add_stats
+        if run_memmon:
+            shared.mem_mon.monitor()
+        t = time.perf_counter()
+
+        try:
+            res = list(func(*args, **kwargs))
+        except Exception as e:
+            # When printing out our debug argument list, cap the output at max_debug_str_len characters
+            max_debug_str_len = 131072 # (1024*1024)/8
+
+            print("Error completing request", file=sys.stderr)
+            argStr = f"Arguments: {str(args)} {str(kwargs)}"
+            print(argStr[:max_debug_str_len], file=sys.stderr)
+            if len(argStr) > max_debug_str_len:
+                print(f"(Argument list truncated at {max_debug_str_len}/{len(argStr)} characters)", file=sys.stderr)
+
+            print(traceback.format_exc(), file=sys.stderr)
+
+            shared.state.job = ""
+            shared.state.job_count = 0
+
+            if extra_outputs_array is None:
+                extra_outputs_array = [None, '']
+
+            # the exception text is HTML-escaped before being embedded in markup
+            res = extra_outputs_array + [f"<div class='error'>{html.escape(type(e).__name__+': '+str(e))}</div>"]
+
+        # reset the shared flags on both the success and the failure path
+        shared.state.skipped = False
+        shared.state.interrupted = False
+        shared.state.job_count = 0
+
+        if not add_stats:
+            return tuple(res)
+
+        elapsed = time.perf_counter() - t
+        elapsed_m = int(elapsed // 60)
+        elapsed_s = elapsed % 60
+        elapsed_text = f"{elapsed_s:.2f}s"
+        if elapsed_m > 0:
+            elapsed_text = f"{elapsed_m}m "+elapsed_text
+
+        if run_memmon:
+            # -(v // -d) is ceiling division; presumably converts bytes to MiB (TODO confirm units)
+            mem_stats = {k: -(v//-(1024*1024)) for k, v in shared.mem_mon.stop().items()}
+            active_peak = mem_stats['active_peak']
+            reserved_peak = mem_stats['reserved_peak']
+            sys_peak = mem_stats['system_peak']
+            sys_total = mem_stats['total']
+            # max(..., 1) guards against division by zero
+            sys_pct = round(sys_peak/max(sys_total, 1) * 100, 2)
+
+            vram_html = f"<p class='vram'>Torch active/reserved: {active_peak}/{reserved_peak} MiB, <wbr>Sys VRAM: {sys_peak}/{sys_total} MiB ({sys_pct}%)</p>"
+        else:
+            vram_html = ''
+
+        # last item is always HTML
+        res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr>{elapsed_text}</p>{vram_html}</div>"
+
+        return tuple(res)
+
+    return f
+
--- a/modules/codeformer/vqgan_arch.py
+++ b/modules/codeformer/vqgan_arch.py
@ -382,7 +382,7 @@ class VQAutoEncoder(nn.Module):
                self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
                logger.info(f'vqgan is loaded from: {model_path} [params]')
            else:
-                raise ValueError(f'Wrong params!')
+                raise ValueError('Wrong params!')


    def forward(self, x):
@ -431,7 +431,7 @@ class VQGANDiscriminator(nn.Module):
            elif 'params' in chkpt:
                self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
            else:
-                raise ValueError(f'Wrong params!')
+                raise ValueError('Wrong params!')

    def forward(self, x):
        return self.main(x)
--- a/modules/codeformer_model.py
+++ b/modules/codeformer_model.py
@ -36,6 +36,7 @@ def setup_model(dirname):
        from basicsr.utils.download_util import load_file_from_url
        from basicsr.utils import imwrite, img2tensor, tensor2img
        from facelib.utils.face_restoration_helper import FaceRestoreHelper
+        from facelib.detection.retinaface import retinaface
        from modules.shared import cmd_opts

        net_class = CodeFormer
@ -65,6 +66,8 @@ def setup_model(dirname):
                net.load_state_dict(checkpoint)
                net.eval()

+                if hasattr(retinaface, 'device'):
+                    retinaface.device = devices.device_codeformer
                face_helper = FaceRestoreHelper(1, face_size=512, crop_ratio=(1, 1), det_model='retinaface_resnet50', save_ext='png', use_parse=True, device=devices.device_codeformer)

                self.net = net
--- a/modules/deepbooru.py
+++ b/modules/deepbooru.py
@ -1,173 +1,99 @@
-import os.path
-from concurrent.futures import ProcessPoolExecutor
-import multiprocessing
-import time
+import os
 import re

+import torch
+from PIL import Image
+import numpy as np
+
+from modules import modelloader, paths, deepbooru_model, devices, images, shared
+
 re_special = re.compile(r'([\\()])')

-def get_deepbooru_tags(pil_image):
-    """
-    This method is for running only one image at a time for simple use.  Used to the img2img interrogate.
-    """
-    from modules import shared  # prevents circular reference

-    try:
-        create_deepbooru_process(shared.opts.interrogate_deepbooru_score_threshold, create_deepbooru_opts())
-        return get_tags_from_process(pil_image)
-    finally:
-        release_process()
+class DeepDanbooru:
+    def __init__(self):
+        # The network is created lazily: load() builds it on first use and
+        # returns early on later calls once self.model is set.
+        self.model = None

+    def load(self):
+        if self.model is not None:
+            return

-OPT_INCLUDE_RANKS = "include_ranks"
-def create_deepbooru_opts():
-    from modules import shared
+        files = modelloader.load_models(
+            model_path=os.path.join(paths.models_path, "torch_deepdanbooru"),
+            model_url='https://github.com/AUTOMATIC1111/TorchDeepDanbooru/releases/download/v1/model-resnet_custom_v3.pt',
+            ext_filter=[".pt"],
+            download_name='model-resnet_custom_v3.pt',
+        )

-    return {
-        "use_spaces": shared.opts.deepbooru_use_spaces,
-        "use_escape": shared.opts.deepbooru_escape,
-        "alpha_sort": shared.opts.deepbooru_sort_alpha,
-        OPT_INCLUDE_RANKS: shared.opts.interrogate_return_ranks,
-    }
+        self.model = deepbooru_model.DeepDanbooruModel()
+        self.model.load_state_dict(torch.load(files[0], map_location="cpu"))

+        self.model.eval()
+        self.model.to(devices.cpu, devices.dtype)

-def deepbooru_process(queue, deepbooru_process_return, threshold, deepbooru_opts):
-    model, tags = get_deepbooru_tags_model()
-    while True: # while process is running, keep monitoring queue for new image
-        pil_image = queue.get()
-        if pil_image == "QUIT":
-            break
-        else:
-            deepbooru_process_return["value"] = get_deepbooru_tags_from_model(model, tags, pil_image, threshold, deepbooru_opts)
+    def start(self):
+        """Make sure the model is loaded, then move it to devices.device.
+
+        load() returns immediately when self.model is already set, so calling
+        start() repeatedly only repeats the device move.
+        """
+        self.load()
+        self.model.to(devices.device)

+    def stop(self):
+        """Move the model back to the CPU unless it should stay resident.
+
+        Does nothing when no model has been loaded yet, or when
+        shared.opts.interrogate_keep_models_in_memory is set. Otherwise the
+        model is moved to devices.cpu and devices.torch_gc() is called.
+        """
+        if self.model is None:
+            # stop() without a prior load()/start(): there is nothing to move,
+            # and calling .to() on None would raise AttributeError.
+            return
+
+        if shared.opts.interrogate_keep_models_in_memory:
+            return
+
+        self.model.to(devices.cpu)
+        devices.torch_gc()

-def create_deepbooru_process(threshold, deepbooru_opts):
-    """
-    Creates deepbooru process.  A queue is created to send images into the process.  This enables multiple images
-    to be processed in a row without reloading the model or creating a new process.  To return the data, a shared
-    dictionary is created to hold the tags created.  To wait for tags to be returned, a value of -1 is assigned
-    to the dictionary and the method adding the image to the queue should wait for this value to be updated with
-    the tags.
-    """
-    from modules import shared  # prevents circular reference
-    context = multiprocessing.get_context("spawn")
-    shared.deepbooru_process_manager = context.Manager()
-    shared.deepbooru_process_queue = shared.deepbooru_process_manager.Queue()
-    shared.deepbooru_process_return = shared.deepbooru_process_manager.dict()
-    shared.deepbooru_process_return["value"] = -1
-    shared.deepbooru_process = context.Process(target=deepbooru_process, args=(shared.deepbooru_process_queue, shared.deepbooru_process_return, threshold, deepbooru_opts))
-    shared.deepbooru_process.start()
+    def tag(self, pil_image):
+        """Tag a single image, starting the model before and stopping it after.
+
+        Args:
+            pil_image: image passed unchanged to tag_multi().
+
+        Returns:
+            Whatever tag_multi() returns for the image.
+
+        Raises:
+            Any exception raised by tag_multi() propagates to the caller after
+            stop() has run.
+        """
+        self.start()
+        try:
+            return self.tag_multi(pil_image)
+        finally:
+            # Run stop() even when inference fails; otherwise the model would
+            # stay on devices.device after an error.
+            self.stop()

-def get_tags_from_process(image):
-    from modules import shared
+    def tag_multi(self, pil_image, force_disable_ranks=False):
+        threshold = shared.opts.interrogate_deepbooru_score_threshold
+        use_spaces = shared.opts.deepbooru_use_spaces
+        use_escape = shared.opts.deepbooru_escape
+        alpha_sort = shared.opts.deepbooru_sort_alpha
+        include_ranks = shared.opts.interrogate_return_ranks and not force_disable_ranks

-    shared.deepbooru_process_return["value"] = -1
-    shared.deepbooru_process_queue.put(image)
-    while shared.deepbooru_process_return["value"] == -1:
-        time.sleep(0.2)
-    caption = shared.deepbooru_process_return["value"]
-    shared.deepbooru_process_return["value"] = -1
+        pic = images.resize_image(2, pil_image.convert("RGB"), 512, 512)
+        a = np.expand_dims(np.array(pic, dtype=np.float32), 0) / 255

-    return caption
+        with torch.no_grad(), devices.autocast():
+            x = torch.from_numpy(a).to(devices.device)
+            y = self.model(x)[0].detach().cpu().numpy()

+        probability_dict = {}

-def release_process():
-    """
-    Stops the deepbooru process to return used memory
-    """
-    from modules import shared  # prevents circular reference
-    shared.deepbooru_process_queue.put("QUIT")
-    shared.deepbooru_process.join()
-    shared.deepbooru_process_queue = None
-    shared.deepbooru_process = None
-    shared.deepbooru_process_return = None
-    shared.deepbooru_process_manager = None
+        for tag, probability in zip(self.model.tags, y):
+            if probability < threshold:
+                continue

-def get_deepbooru_tags_model():
-    import deepdanbooru as dd
-    import tensorflow as tf
-    import numpy as np
-    this_folder = os.path.dirname(__file__)
-    model_path = os.path.abspath(os.path.join(this_folder, '..', 'models', 'deepbooru'))
-    if not os.path.exists(os.path.join(model_path, 'project.json')):
-        # there is no point importing these every time
-        import zipfile
-        from basicsr.utils.download_util import load_file_from_url
-        load_file_from_url(
-            r"https://github.com/KichangKim/DeepDanbooru/releases/download/v3-20211112-sgd-e28/deepdanbooru-v3-20211112-sgd-e28.zip",
-            model_path)
-        with zipfile.ZipFile(os.path.join(model_path, "deepdanbooru-v3-20211112-sgd-e28.zip"), "r") as zip_ref:
-            zip_ref.extractall(model_path)
-        os.remove(os.path.join(model_path, "deepdanbooru-v3-20211112-sgd-e28.zip"))
-
-    tags = dd.project.load_tags_from_project(model_path)
-    model = dd.project.load_model_from_project(
-        model_path, compile_model=False
-    )
-    return model, tags
-
-
-def get_deepbooru_tags_from_model(model, tags, pil_image, threshold, deepbooru_opts):
-    import deepdanbooru as dd
-    import tensorflow as tf
-    import numpy as np
-
-    alpha_sort = deepbooru_opts['alpha_sort']
-    use_spaces = deepbooru_opts['use_spaces']
-    use_escape = deepbooru_opts['use_escape']
-    include_ranks = deepbooru_opts['include_ranks']
-
-    width = model.input_shape[2]
-    height = model.input_shape[1]
-    image = np.array(pil_image)
-    image = tf.image.resize(
-        image,
-        size=(height, width),
-        method=tf.image.ResizeMethod.AREA,
-        preserve_aspect_ratio=True,
-    )
-    image = image.numpy()  # EagerTensor to np.array
-    image = dd.image.transform_and_pad_image(image, width, height)
-    image = image / 255.0
-    image_shape = image.shape
-    image = image.reshape((1, image_shape[0], image_shape[1], image_shape[2]))
-
-    y = model.predict(image)[0]
-
-    result_dict = {}
-
-    for i, tag in enumerate(tags):
-        result_dict[tag] = y[i]
-
-    unsorted_tags_in_theshold = []
-    result_tags_print = []
-    for tag in tags:
-        if result_dict[tag] >= threshold:
            if tag.startswith("rating:"):
                continue
-            unsorted_tags_in_theshold.append((result_dict[tag], tag))
-            result_tags_print.append(f'{result_dict[tag]} {tag}')

-    # sort tags
-    result_tags_out = []
-    sort_ndx = 0
-    if alpha_sort:
-        sort_ndx = 1
+            probability_dict[tag] = probability

-    # sort by reverse by likelihood and normal for alpha, and format tag text as requested
-    unsorted_tags_in_theshold.sort(key=lambda y: y[sort_ndx], reverse=(not alpha_sort))
-    for weight, tag in unsorted_tags_in_theshold:
-        tag_outformat = tag
-        if use_spaces:
-            tag_outformat = tag_outformat.replace('_', ' ')
-        if use_escape:
-            tag_outformat = re.sub(re_special, r'\\\1', tag_outformat)
-        if include_ranks:
-            tag_outformat = f"({tag_outformat}:{weight:.3f})"
+        if alpha_sort:
+            tags = sorted(probability_dict)
+        else:
+            tags = [tag for tag, _ in sorted(probability_dict.items(), key=lambda x: -x[1])]

-        result_tags_out.append(tag_outformat)
+        res = []

-    print('\n'.join(sorted(result_tags_print, reverse=True)))
+        filtertags = set([x.strip().replace(' ', '_') for x in shared.opts.deepbooru_filter_tags.split(",")])

-    return ', '.join(result_tags_out)
+        for tag in [x for x in tags if x not in filtertags]:
+            probability = probability_dict[tag]
+            tag_outformat = tag
+            if use_spaces:
+                tag_outformat = tag_outformat.replace('_', ' ')
+            if use_escape:
+                tag_outformat = re.sub(re_special, r'\\\1', tag_outformat)
+            if include_ranks:
+                tag_outformat = f"({tag_outformat}:{probability:.3f})"
+
+            res.append(tag_outformat)
+
+        return ", ".join(res)
+
+
+model = DeepDanbooru()
--- a/modules/deepbooru_model.py
+++ b/modules/deepbooru_model.py
@ -0,0 +1,676 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# see https://github.com/AUTOMATIC1111/TorchDeepDanbooru for more
+
+
+class DeepDanbooruModel(nn.Module):
+    def __init__(self):
+        """Declare every layer of the network as a numbered attribute.
+
+        Creates one max-pool layer (n_MaxPool_0) and 179 convolutions
+        (n_Conv_0 .. n_Conv_178). No layer is given a padding= argument; the
+        part of forward() visible alongside this constructor pads explicitly
+        with F.pad before the 7x7 and 3x3 convolutions.
+        """
+        super(DeepDanbooruModel, self).__init__()
+
+        # Starts empty and is not filled in here.
+        # NOTE(review): presumably populated when the weights are loaded - confirm.
+        self.tags = []
+
+        # Input layers: 7x7 stride-2 convolution (3 -> 64) and 3x3 stride-2 max-pool.
+        self.n_Conv_0 = nn.Conv2d(kernel_size=(7, 7), in_channels=3, out_channels=64, stride=(2, 2))
+        self.n_MaxPool_0 = nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2))
+        # n_Conv_1 .. n_Conv_10: 64-channel 3x3 layers between 1x1 layers going to/from 256 channels.
+        self.n_Conv_1 = nn.Conv2d(kernel_size=(1, 1), in_channels=64, out_channels=256)
+        self.n_Conv_2 = nn.Conv2d(kernel_size=(1, 1), in_channels=64, out_channels=64)
+        self.n_Conv_3 = nn.Conv2d(kernel_size=(3, 3), in_channels=64, out_channels=64)
+        self.n_Conv_4 = nn.Conv2d(kernel_size=(1, 1), in_channels=64, out_channels=256)
+        self.n_Conv_5 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=64)
+        self.n_Conv_6 = nn.Conv2d(kernel_size=(3, 3), in_channels=64, out_channels=64)
+        self.n_Conv_7 = nn.Conv2d(kernel_size=(1, 1), in_channels=64, out_channels=256)
+        self.n_Conv_8 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=64)
+        self.n_Conv_9 = nn.Conv2d(kernel_size=(3, 3), in_channels=64, out_channels=64)
+        self.n_Conv_10 = nn.Conv2d(kernel_size=(1, 1), in_channels=64, out_channels=256)
+        # n_Conv_11 .. n_Conv_35: 128-channel 3x3 layers, 512-channel 1x1 outputs;
+        # n_Conv_11 and n_Conv_13 use stride 2.
+        self.n_Conv_11 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=512, stride=(2, 2))
+        self.n_Conv_12 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=128)
+        self.n_Conv_13 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128, stride=(2, 2))
+        self.n_Conv_14 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_15 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_16 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_17 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_18 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_19 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_20 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_21 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_22 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_23 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_24 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_25 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_26 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_27 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_28 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_29 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_30 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_31 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_32 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        self.n_Conv_33 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=128)
+        self.n_Conv_34 = nn.Conv2d(kernel_size=(3, 3), in_channels=128, out_channels=128)
+        self.n_Conv_35 = nn.Conv2d(kernel_size=(1, 1), in_channels=128, out_channels=512)
+        # n_Conv_36 .. n_Conv_157: 256-channel 3x3 layers, 1024-channel 1x1 outputs;
+        # n_Conv_36, n_Conv_38, n_Conv_98 and n_Conv_100 use stride 2.
+        self.n_Conv_36 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=1024, stride=(2, 2))
+        self.n_Conv_37 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=256)
+        self.n_Conv_38 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256, stride=(2, 2))
+        self.n_Conv_39 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_40 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_41 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_42 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_43 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_44 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_45 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_46 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_47 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_48 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_49 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_50 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_51 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_52 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_53 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_54 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_55 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_56 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_57 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_58 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_59 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_60 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_61 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_62 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_63 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_64 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_65 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_66 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_67 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_68 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_69 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_70 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_71 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_72 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_73 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_74 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_75 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_76 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_77 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_78 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_79 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_80 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_81 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_82 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_83 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_84 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_85 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_86 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_87 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_88 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_89 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_90 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_91 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_92 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_93 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_94 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_95 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_96 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_97 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_98 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256, stride=(2, 2))
+        self.n_Conv_99 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_100 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=1024, stride=(2, 2))
+        self.n_Conv_101 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_102 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_103 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_104 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_105 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_106 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_107 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_108 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_109 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_110 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_111 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_112 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_113 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_114 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_115 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_116 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_117 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_118 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_119 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_120 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_121 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_122 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_123 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_124 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_125 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_126 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_127 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_128 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_129 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_130 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_131 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_132 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_133 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_134 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_135 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_136 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_137 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_138 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_139 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_140 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_141 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_142 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_143 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_144 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_145 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_146 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_147 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_148 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_149 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_150 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_151 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_152 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_153 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_154 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        self.n_Conv_155 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=256)
+        self.n_Conv_156 = nn.Conv2d(kernel_size=(3, 3), in_channels=256, out_channels=256)
+        self.n_Conv_157 = nn.Conv2d(kernel_size=(1, 1), in_channels=256, out_channels=1024)
+        # n_Conv_158 .. n_Conv_167: 512-channel 3x3 layers, 2048-channel 1x1 outputs;
+        # n_Conv_158 and n_Conv_160 use stride 2.
+        self.n_Conv_158 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=2048, stride=(2, 2))
+        self.n_Conv_159 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=512)
+        self.n_Conv_160 = nn.Conv2d(kernel_size=(3, 3), in_channels=512, out_channels=512, stride=(2, 2))
+        self.n_Conv_161 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=2048)
+        self.n_Conv_162 = nn.Conv2d(kernel_size=(1, 1), in_channels=2048, out_channels=512)
+        self.n_Conv_163 = nn.Conv2d(kernel_size=(3, 3), in_channels=512, out_channels=512)
+        self.n_Conv_164 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=2048)
+        self.n_Conv_165 = nn.Conv2d(kernel_size=(1, 1), in_channels=2048, out_channels=512)
+        self.n_Conv_166 = nn.Conv2d(kernel_size=(3, 3), in_channels=512, out_channels=512)
+        self.n_Conv_167 = nn.Conv2d(kernel_size=(1, 1), in_channels=512, out_channels=2048)
+        # n_Conv_168 .. n_Conv_177: 1024-channel 3x3 layers, 4096-channel 1x1 outputs;
+        # n_Conv_168 and n_Conv_170 use stride 2.
+        self.n_Conv_168 = nn.Conv2d(kernel_size=(1, 1), in_channels=2048, out_channels=4096, stride=(2, 2))
+        self.n_Conv_169 = nn.Conv2d(kernel_size=(1, 1), in_channels=2048, out_channels=1024)
+        self.n_Conv_170 = nn.Conv2d(kernel_size=(3, 3), in_channels=1024, out_channels=1024, stride=(2, 2))
+        self.n_Conv_171 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=4096)
+        self.n_Conv_172 = nn.Conv2d(kernel_size=(1, 1), in_channels=4096, out_channels=1024)
+        self.n_Conv_173 = nn.Conv2d(kernel_size=(3, 3), in_channels=1024, out_channels=1024)
+        self.n_Conv_174 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=4096)
+        self.n_Conv_175 = nn.Conv2d(kernel_size=(1, 1), in_channels=4096, out_channels=1024)
+        self.n_Conv_176 = nn.Conv2d(kernel_size=(3, 3), in_channels=1024, out_channels=1024)
+        self.n_Conv_177 = nn.Conv2d(kernel_size=(1, 1), in_channels=1024, out_channels=4096)
+        # Final 1x1 convolution without bias, 4096 -> 9176 channels.
+        # NOTE(review): 9176 is presumably the number of tags - confirm.
+        self.n_Conv_178 = nn.Conv2d(kernel_size=(1, 1), in_channels=4096, out_channels=9176, bias=False)
+
+    def forward(self, *inputs):
+        t_358, = inputs
+        t_359 = t_358.permute(*[0, 3, 1, 2])
+        t_359_padded = F.pad(t_359, [2, 3, 2, 3], value=0)
+        t_360 = self.n_Conv_0(t_359_padded)
+        t_361 = F.relu(t_360)
+        t_361 = F.pad(t_361, [0, 1, 0, 1], value=float('-inf'))
+        t_362 = self.n_MaxPool_0(t_361)
+        t_363 = self.n_Conv_1(t_362)
+        t_364 = self.n_Conv_2(t_362)
+        t_365 = F.relu(t_364)
+        t_365_padded = F.pad(t_365, [1, 1, 1, 1], value=0)
+        t_366 = self.n_Conv_3(t_365_padded)
+        t_367 = F.relu(t_366)
+        t_368 = self.n_Conv_4(t_367)
+        t_369 = torch.add(t_368, t_363)
+        t_370 = F.relu(t_369)
+        t_371 = self.n_Conv_5(t_370)
+        t_372 = F.relu(t_371)
+        t_372_padded = F.pad(t_372, [1, 1, 1, 1], value=0)
+        t_373 = self.n_Conv_6(t_372_padded)
+        t_374 = F.relu(t_373)
+        t_375 = self.n_Conv_7(t_374)
+        t_376 = torch.add(t_375, t_370)
+        t_377 = F.relu(t_376)
+        t_378 = self.n_Conv_8(t_377)
+        t_379 = F.relu(t_378)
+        t_379_padded = F.pad(t_379, [1, 1, 1, 1], value=0)
+        t_380 = self.n_Conv_9(t_379_padded)
+        t_381 = F.relu(t_380)
+        t_382 = self.n_Conv_10(t_381)
+        t_383 = torch.add(t_382, t_377)
+        t_384 = F.relu(t_383)
+        t_385 = self.n_Conv_11(t_384)
+        t_386 = self.n_Conv_12(t_384)
+        t_387 = F.relu(t_386)
+        t_387_padded = F.pad(t_387, [0, 1, 0, 1], value=0)
+        t_388 = self.n_Conv_13(t_387_padded)
+        t_389 = F.relu(t_388)
+        t_390 = self.n_Conv_14(t_389)
+        t_391 = torch.add(t_390, t_385)
+        t_392 = F.relu(t_391)
+        t_393 = self.n_Conv_15(t_392)
+        t_394 = F.relu(t_393)
+        t_394_padded = F.pad(t_394, [1, 1, 1, 1], value=0)
+        t_395 = self.n_Conv_16(t_394_padded)
+        t_396 = F.relu(t_395)
+        t_397 = self.n_Conv_17(t_396)
+        t_398 = torch.add(t_397, t_392)
+        t_399 = F.relu(t_398)
+        t_400 = self.n_Conv_18(t_399)
+        t_401 = F.relu(t_400)
+        t_401_padded = F.pad(t_401, [1, 1, 1, 1], value=0)
+        t_402 = self.n_Conv_19(t_401_padded)
+        t_403 = F.relu(t_402)
+        t_404 = self.n_Conv_20(t_403)
+        t_405 = torch.add(t_404, t_399)
+        t_406 = F.relu(t_405)
+        t_407 = self.n_Conv_21(t_406)
+        t_408 = F.relu(t_407)
+        t_408_padded = F.pad(t_408, [1, 1, 1, 1], value=0)
+        t_409 = self.n_Conv_22(t_408_padded)
+        t_410 = F.relu(t_409)
+        t_411 = self.n_Conv_23(t_410)
+        t_412 = torch.add(t_411, t_406)
+        t_413 = F.relu(t_412)
+        t_414 = self.n_Conv_24(t_413)
+        t_415 = F.relu(t_414)
+        t_415_padded = F.pad(t_415, [1, 1, 1, 1], value=0)
+        t_416 = self.n_Conv_25(t_415_padded)
+        t_417 = F.relu(t_416)
+        t_418 = self.n_Conv_26(t_417)
+        t_419 = torch.add(t_418, t_413)
+        t_420 = F.relu(t_419)
+        t_421 = self.n_Conv_27(t_420)
+        t_422 = F.relu(t_421)
+        t_422_padded = F.pad(t_422, [1, 1, 1, 1], value=0)
+        t_423 = self.n_Conv_28(t_422_padded)
+        t_424 = F.relu(t_423)
+        t_425 = self.n_Conv_29(t_424)
+        t_426 = torch.add(t_425, t_420)
+        t_427 = F.relu(t_426)
+        t_428 = self.n_Conv_30(t_427)
+        t_429 = F.relu(t_428)
+        t_429_padded = F.pad(t_429, [1, 1, 1, 1], value=0)
+        t_430 = self.n_Conv_31(t_429_padded)
+        t_431 = F.relu(t_430)
+        t_432 = self.n_Conv_32(t_431)
+        t_433 = torch.add(t_432, t_427)
+        t_434 = F.relu(t_433)
+        t_435 = self.n_Conv_33(t_434)
+        t_436 = F.relu(t_435)
+        t_436_padded = F.pad(t_436, [1, 1, 1, 1], value=0)
+        t_437 = self.n_Conv_34(t_436_padded)
+        t_438 = F.relu(t_437)
+        t_439 = self.n_Conv_35(t_438)
+        t_440 = torch.add(t_439, t_434)
+        t_441 = F.relu(t_440)
+        t_442 = self.n_Conv_36(t_441)
+        t_443 = self.n_Conv_37(t_441)
+        t_444 = F.relu(t_443)
+        t_444_padded = F.pad(t_444, [0, 1, 0, 1], value=0)
+        t_445 = self.n_Conv_38(t_444_padded)
+        t_446 = F.relu(t_445)
+        t_447 = self.n_Conv_39(t_446)
+        t_448 = torch.add(t_447, t_442)
+        t_449 = F.relu(t_448)
+        t_450 = self.n_Conv_40(t_449)
+        t_451 = F.relu(t_450)
+        t_451_padded = F.pad(t_451, [1, 1, 1, 1], value=0)
+        t_452 = self.n_Conv_41(t_451_padded)
+        t_453 = F.relu(t_452)
+        t_454 = self.n_Conv_42(t_453)
+        t_455 = torch.add(t_454, t_449)
+        t_456 = F.relu(t_455)
+        t_457 = self.n_Conv_43(t_456)
+        t_458 = F.relu(t_457)
+        t_458_padded = F.pad(t_458, [1, 1, 1, 1], value=0)
+        t_459 = self.n_Conv_44(t_458_padded)
+        t_460 = F.relu(t_459)
+        t_461 = self.n_Conv_45(t_460)
+        t_462 = torch.add(t_461, t_456)
+        t_463 = F.relu(t_462)
+        t_464 = self.n_Conv_46(t_463)
+        t_465 = F.relu(t_464)
+        t_465_padded = F.pad(t_465, [1, 1, 1, 1], value=0)
+        t_466 = self.n_Conv_47(t_465_padded)
+        t_467 = F.relu(t_466)
+        t_468 = self.n_Conv_48(t_467)
+        t_469 = torch.add(t_468, t_463)
+        t_470 = F.relu(t_469)
+        t_471 = self.n_Conv_49(t_470)
+        t_472 = F.relu(t_471)
+        t_472_padded = F.pad(t_472, [1, 1, 1, 1], value=0)
+        t_473 = self.n_Conv_50(t_472_padded)
+        t_474 = F.relu(t_473)
+        t_475 = self.n_Conv_51(t_474)
+        t_476 = torch.add(t_475, t_470)
+        t_477 = F.relu(t_476)
+        t_478 = self.n_Conv_52(t_477)
+        t_479 = F.relu(t_478)
+        t_479_padded = F.pad(t_479, [1, 1, 1, 1], value=0)
+        t_480 = self.n_Conv_53(t_479_padded)
+        t_481 = F.relu(t_480)
+        t_482 = self.n_Conv_54(t_481)
+        t_483 = torch.add(t_482, t_477)
+        t_484 = F.relu(t_483)
+        t_485 = self.n_Conv_55(t_484)
+        t_486 = F.relu(t_485)
+        t_486_padded = F.pad(t_486, [1, 1, 1, 1], value=0)
+        t_487 = self.n_Conv_56(t_486_padded)
+        t_488 = F.relu(t_487)
+        t_489 = self.n_Conv_57(t_488)
+        t_490 = torch.add(t_489, t_484)
+        t_491 = F.relu(t_490)
+        t_492 = self.n_Conv_58(t_491)
+        t_493 = F.relu(t_492)
+        t_493_padded = F.pad(t_493, [1, 1, 1, 1], value=0)
+        t_494 = self.n_Conv_59(t_493_padded)
+        t_495 = F.relu(t_494)
+        t_496 = self.n_Conv_60(t_495)
+        t_497 = torch.add(t_496, t_491)
+        t_498 = F.relu(t_497)
+        t_499 = self.n_Conv_61(t_498)
+        t_500 = F.relu(t_499)
+        t_500_padded = F.pad(t_500, [1, 1, 1, 1], value=0)
+        t_501 = self.n_Conv_62(t_500_padded)
+        t_502 = F.relu(t_501)
+        t_503 = self.n_Conv_63(t_502)
+        t_504 = torch.add(t_503, t_498)
+        t_505 = F.relu(t_504)
+        t_506 = self.n_Conv_64(t_505)
+        t_507 = F.relu(t_506)
+        t_507_padded = F.pad(t_507, [1, 1, 1, 1], value=0)
+        t_508 = self.n_Conv_65(t_507_padded)
+        t_509 = F.relu(t_508)
+        t_510 = self.n_Conv_66(t_509)
+        t_511 = torch.add(t_510, t_505)
+        t_512 = F.relu(t_511)
+        t_513 = self.n_Conv_67(t_512)
+        t_514 = F.relu(t_513)
+        t_514_padded = F.pad(t_514, [1, 1, 1, 1], value=0)
+        t_515 = self.n_Conv_68(t_514_padded)
+        t_516 = F.relu(t_515)
+        t_517 = self.n_Conv_69(t_516)
+        t_518 = torch.add(t_517, t_512)
+        t_519 = F.relu(t_518)
+        t_520 = self.n_Conv_70(t_519)
+        t_521 = F.relu(t_520)
+        t_521_padded = F.pad(t_521, [1, 1, 1, 1], value=0)
+        t_522 = self.n_Conv_71(t_521_padded)
+        t_523 = F.relu(t_522)
+        t_524 = self.n_Conv_72(t_523)
+        t_525 = torch.add(t_524, t_519)
+        t_526 = F.relu(t_525)
+        t_527 = self.n_Conv_73(t_526)
+        t_528 = F.relu(t_527)
+        t_528_padded = F.pad(t_528, [1, 1, 1, 1], value=0)
+        t_529 = self.n_Conv_74(t_528_padded)
+        t_530 = F.relu(t_529)
+        t_531 = self.n_Conv_75(t_530)
+        t_532 = torch.add(t_531, t_526)
+        t_533 = F.relu(t_532)
+        t_534 = self.n_Conv_76(t_533)
+        t_535 = F.relu(t_534)
+        t_535_padded = F.pad(t_535, [1, 1, 1, 1], value=0)
+        t_536 = self.n_Conv_77(t_535_padded)
+        t_537 = F.relu(t_536)
+        t_538 = self.n_Conv_78(t_537)
+        t_539 = torch.add(t_538, t_533)
+        t_540 = F.relu(t_539)
+        t_541 = self.n_Conv_79(t_540)
+        t_542 = F.relu(t_541)
+        t_542_padded = F.pad(t_542, [1, 1, 1, 1], value=0)
+        t_543 = self.n_Conv_80(t_542_padded)
+        t_544 = F.relu(t_543)
+        t_545 = self.n_Conv_81(t_544)
+        t_546 = torch.add(t_545, t_540)
+        t_547 = F.relu(t_546)
+        t_548 = self.n_Conv_82(t_547)
+        t_549 = F.relu(t_548)
+        t_549_padded = F.pad(t_549, [1, 1, 1, 1], value=0)
+        t_550 = self.n_Conv_83(t_549_padded)
+        t_551 = F.relu(t_550)
+        t_552 = self.n_Conv_84(t_551)
+        t_553 = torch.add(t_552, t_547)
+        t_554 = F.relu(t_553)
+        t_555 = self.n_Conv_85(t_554)
+        t_556 = F.relu(t_555)
+        t_556_padded = F.pad(t_556, [1, 1, 1, 1], value=0)
+        t_557 = self.n_Conv_86(t_556_padded)
+        t_558 = F.relu(t_557)
+        t_559 = self.n_Conv_87(t_558)
+        t_560 = torch.add(t_559, t_554)
+        t_561 = F.relu(t_560)
+        t_562 = self.n_Conv_88(t_561)
+        t_563 = F.relu(t_562)
+        t_563_padded = F.pad(t_563, [1, 1, 1, 1], value=0)
+        t_564 = self.n_Conv_89(t_563_padded)
+        t_565 = F.relu(t_564)
+        t_566 = self.n_Conv_90(t_565)
+        t_567 = torch.add(t_566, t_561)
+        t_568 = F.relu(t_567)
+        t_569 = self.n_Conv_91(t_568)
+        t_570 = F.relu(t_569)
+        t_570_padded = F.pad(t_570, [1, 1, 1, 1], value=0)
+        t_571 = self.n_Conv_92(t_570_padded)
+        t_572 = F.relu(t_571)
+        t_573 = self.n_Conv_93(t_572)
+        t_574 = torch.add(t_573, t_568)
+        t_575 = F.relu(t_574)
+        t_576 = self.n_Conv_94(t_575)
+        t_577 = F.relu(t_576)
+        t_577_padded = F.pad(t_577, [1, 1, 1, 1], value=0)
+        t_578 = self.n_Conv_95(t_577_padded)
+        t_579 = F.relu(t_578)
+        t_580 = self.n_Conv_96(t_579)
+        t_581 = torch.add(t_580, t_575)
+        t_582 = F.relu(t_581)
+        t_583 = self.n_Conv_97(t_582)
+        t_584 = F.relu(t_583)
+        t_584_padded = F.pad(t_584, [0, 1, 0, 1], value=0)
+        t_585 = self.n_Conv_98(t_584_padded)
+        t_586 = F.relu(t_585)
+        t_587 = self.n_Conv_99(t_586)
+        t_588 = self.n_Conv_100(t_582)
+        t_589 = torch.add(t_587, t_588)
+        t_590 = F.relu(t_589)
+        t_591 = self.n_Conv_101(t_590)
+        t_592 = F.relu(t_591)
+        t_592_padded = F.pad(t_592, [1, 1, 1, 1], value=0)
+        t_593 = self.n_Conv_102(t_592_padded)
+        t_594 = F.relu(t_593)
+        t_595 = self.n_Conv_103(t_594)
+        t_596 = torch.add(t_595, t_590)
+        t_597 = F.relu(t_596)
+        t_598 = self.n_Conv_104(t_597)
+        t_599 = F.relu(t_598)
+        t_599_padded = F.pad(t_599, [1, 1, 1, 1], value=0)
+        t_600 = self.n_Conv_105(t_599_padded)
+        t_601 = F.relu(t_600)
+        t_602 = self.n_Conv_106(t_601)
+        t_603 = torch.add(t_602, t_597)
+        t_604 = F.relu(t_603)
+        t_605 = self.n_Conv_107(t_604)
+        t_606 = F.relu(t_605)
+        t_606_padded = F.pad(t_606, [1, 1, 1, 1], value=0)
+        t_607 = self.n_Conv_108(t_606_padded)
+        t_608 = F.relu(t_607)
+        t_609 = self.n_Conv_109(t_608)
+        t_610 = torch.add(t_609, t_604)
+        t_611 = F.relu(t_610)
+        t_612 = self.n_Conv_110(t_611)
+        t_613 = F.relu(t_612)
+        t_613_padded = F.pad(t_613, [1, 1, 1, 1], value=0)
+        t_614 = self.n_Conv_111(t_613_padded)
+        t_615 = F.relu(t_614)
+        t_616 = self.n_Conv_112(t_615)
+        t_617 = torch.add(t_616, t_611)
+        t_618 = F.relu(t_617)
+        t_619 = self.n_Conv_113(t_618)
+        t_620 = F.relu(t_619)
+        t_620_padded = F.pad(t_620, [1, 1, 1, 1], value=0)
+        t_621 = self.n_Conv_114(t_620_padded)
+        t_622 = F.relu(t_621)
+        t_623 = self.n_Conv_115(t_622)
+        t_624 = torch.add(t_623, t_618)
+        t_625 = F.relu(t_624)
+        t_626 = self.n_Conv_116(t_625)
+        t_627 = F.relu(t_626)
+        t_627_padded = F.pad(t_627, [1, 1, 1, 1], value=0)
+        t_628 = self.n_Conv_117(t_627_padded)
+        t_629 = F.relu(t_628)
+        t_630 = self.n_Conv_118(t_629)
+        t_631 = torch.add(t_630, t_625)
+        t_632 = F.relu(t_631)
+        t_633 = self.n_Conv_119(t_632)
+        t_634 = F.relu(t_633)
+        t_634_padded = F.pad(t_634, [1, 1, 1, 1], value=0)
+        t_635 = self.n_Conv_120(t_634_padded)
+        t_636 = F.relu(t_635)
+        t_637 = self.n_Conv_121(t_636)
+        t_638 = torch.add(t_637, t_632)
+        t_639 = F.relu(t_638)
+        t_640 = self.n_Conv_122(t_639)
+        t_641 = F.relu(t_640)
+        t_641_padded = F.pad(t_641, [1, 1, 1, 1], value=0)
+        t_642 = self.n_Conv_123(t_641_padded)
+        t_643 = F.relu(t_642)
+        t_644 = self.n_Conv_124(t_643)
+        t_645 = torch.add(t_644, t_639)
+        t_646 = F.relu(t_645)
+        t_647 = self.n_Conv_125(t_646)
+        t_648 = F.relu(t_647)
+        t_648_padded = F.pad(t_648, [1, 1, 1, 1], value=0)
+        t_649 = self.n_Conv_126(t_648_padded)
+        t_650 = F.relu(t_649)
+        t_651 = self.n_Conv_127(t_650)
+        t_652 = torch.add(t_651, t_646)
+        t_653 = F.relu(t_652)
+        t_654 = self.n_Conv_128(t_653)
+        t_655 = F.relu(t_654)
+        t_655_padded = F.pad(t_655, [1, 1, 1, 1], value=0)
+        t_656 = self.n_Conv_129(t_655_padded)
+        t_657 = F.relu(t_656)
+        t_658 = self.n_Conv_130(t_657)
+        t_659 = torch.add(t_658, t_653)
+        t_660 = F.relu(t_659)
+        t_661 = self.n_Conv_131(t_660)
+        t_662 = F.relu(t_661)
+        t_662_padded = F.pad(t_662, [1, 1, 1, 1], value=0)
+        t_663 = self.n_Conv_132(t_662_padded)
+        t_664 = F.relu(t_663)
+        t_665 = self.n_Conv_133(t_664)
+        t_666 = torch.add(t_665, t_660)
+        t_667 = F.relu(t_666)
+        t_668 = self.n_Conv_134(t_667)
+        t_669 = F.relu(t_668)
+        t_669_padded = F.pad(t_669, [1, 1, 1, 1], value=0)
+        t_670 = self.n_Conv_135(t_669_padded)
+        t_671 = F.relu(t_670)
+        t_672 = self.n_Conv_136(t_671)
+        t_673 = torch.add(t_672, t_667)
+        t_674 = F.relu(t_673)
+        t_675 = self.n_Conv_137(t_674)
+        t_676 = F.relu(t_675)
+        t_676_padded = F.pad(t_676, [1, 1, 1, 1], value=0)
+        t_677 = self.n_Conv_138(t_676_padded)
+        t_678 = F.relu(t_677)
+        t_679 = self.n_Conv_139(t_678)
+        t_680 = torch.add(t_679, t_674)
+        t_681 = F.relu(t_680)
+        t_682 = self.n_Conv_140(t_681)
+        t_683 = F.relu(t_682)
+        t_683_padded = F.pad(t_683, [1, 1, 1, 1], value=0)
+        t_684 = self.n_Conv_141(t_683_padded)
+        t_685 = F.relu(t_684)
+        t_686 = self.n_Conv_142(t_685)
+        t_687 = torch.add(t_686, t_681)
+        t_688 = F.relu(t_687)
+        t_689 = self.n_Conv_143(t_688)
+        t_690 = F.relu(t_689)
+        t_690_padded = F.pad(t_690, [1, 1, 1, 1], value=0)
+        t_691 = self.n_Conv_144(t_690_padded)
+        t_692 = F.relu(t_691)
+        t_693 = self.n_Conv_145(t_692)
+        t_694 = torch.add(t_693, t_688)
+        t_695 = F.relu(t_694)
+        t_696 = self.n_Conv_146(t_695)
+        t_697 = F.relu(t_696)
+        t_697_padded = F.pad(t_697, [1, 1, 1, 1], value=0)
+        t_698 = self.n_Conv_147(t_697_padded)
+        t_699 = F.relu(t_698)
+        t_700 = self.n_Conv_148(t_699)
+        t_701 = torch.add(t_700, t_695)
+        t_702 = F.relu(t_701)
+        t_703 = self.n_Conv_149(t_702)
+        t_704 = F.relu(t_703)
+        t_704_padded = F.pad(t_704, [1, 1, 1, 1], value=0)
+        t_705 = self.n_Conv_150(t_704_padded)
+        t_706 = F.relu(t_705)
+        t_707 = self.n_Conv_151(t_706)
+        t_708 = torch.add(t_707, t_702)
+        t_709 = F.relu(t_708)
+        t_710 = self.n_Conv_152(t_709)
+        t_711 = F.relu(t_710)
+        t_711_padded = F.pad(t_711, [1, 1, 1, 1], value=0)
+        t_712 = self.n_Conv_153(t_711_padded)
+        t_713 = F.relu(t_712)
+        t_714 = self.n_Conv_154(t_713)
+        t_715 = torch.add(t_714, t_709)
+        t_716 = F.relu(t_715)
+        t_717 = self.n_Conv_155(t_716)
+        t_718 = F.relu(t_717)
+        t_718_padded = F.pad(t_718, [1, 1, 1, 1], value=0)
+        t_719 = self.n_Conv_156(t_718_padded)
+        t_720 = F.relu(t_719)
+        t_721 = self.n_Conv_157(t_720)
+        t_722 = torch.add(t_721, t_716)
+        t_723 = F.relu(t_722)
+        t_724 = self.n_Conv_158(t_723)
+        t_725 = self.n_Conv_159(t_723)
+        t_726 = F.relu(t_725)
+        t_726_padded = F.pad(t_726, [0, 1, 0, 1], value=0)
+        t_727 = self.n_Conv_160(t_726_padded)
+        t_728 = F.relu(t_727)
+        t_729 = self.n_Conv_161(t_728)
+        t_730 = torch.add(t_729, t_724)
+        t_731 = F.relu(t_730)
+        t_732 = self.n_Conv_162(t_731)
+        t_733 = F.relu(t_732)
+        t_733_padded = F.pad(t_733, [1, 1, 1, 1], value=0)
+        t_734 = self.n_Conv_163(t_733_padded)
+        t_735 = F.relu(t_734)
+        t_736 = self.n_Conv_164(t_735)
+        t_737 = torch.add(t_736, t_731)
+        t_738 = F.relu(t_737)
+        t_739 = self.n_Conv_165(t_738)
+        t_740 = F.relu(t_739)
+        t_740_padded = F.pad(t_740, [1, 1, 1, 1], value=0)
+        t_741 = self.n_Conv_166(t_740_padded)
+        t_742 = F.relu(t_741)
+        t_743 = self.n_Conv_167(t_742)
+        t_744 = torch.add(t_743, t_738)
+        t_745 = F.relu(t_744)
+        t_746 = self.n_Conv_168(t_745)
+        t_747 = self.n_Conv_169(t_745)
+        t_748 = F.relu(t_747)
+        t_748_padded = F.pad(t_748, [0, 1, 0, 1], value=0)
+        t_749 = self.n_Conv_170(t_748_padded)
+        t_750 = F.relu(t_749)
+        t_751 = self.n_Conv_171(t_750)
+        t_752 = torch.add(t_751, t_746)
+        t_753 = F.relu(t_752)
+        t_754 = self.n_Conv_172(t_753)
+        t_755 = F.relu(t_754)
+        t_755_padded = F.pad(t_755, [1, 1, 1, 1], value=0)
+        t_756 = self.n_Conv_173(t_755_padded)
+        t_757 = F.relu(t_756)
+        t_758 = self.n_Conv_174(t_757)
+        t_759 = torch.add(t_758, t_753)
+        t_760 = F.relu(t_759)
+        t_761 = self.n_Conv_175(t_760)
+        t_762 = F.relu(t_761)
+        t_762_padded = F.pad(t_762, [1, 1, 1, 1], value=0)
+        t_763 = self.n_Conv_176(t_762_padded)
+        t_764 = F.relu(t_763)
+        t_765 = self.n_Conv_177(t_764)
+        t_766 = torch.add(t_765, t_760)
+        t_767 = F.relu(t_766)
+        t_768 = self.n_Conv_178(t_767)
+        t_769 = F.avg_pool2d(t_768, kernel_size=t_768.shape[-2:])
+        t_770 = torch.squeeze(t_769, 3)
+        t_770 = torch.squeeze(t_770, 2)
+        t_771 = torch.sigmoid(t_770)
+        return t_771
+
+    def load_state_dict(self, state_dict, **kwargs):
+        """Load weights from ``state_dict`` and pick up its optional tag list.
+
+        The ``'tags'`` entry (an empty list when absent) is stored on ``self.tags``
+        and is left out of the mapping handed to the parent implementation.
+        Extra keyword arguments (for example ``strict``) are forwarded to the
+        parent, and the parent's result is returned to the caller.
+        """
+        self.tags = state_dict.get('tags', [])
+
+        # Everything except 'tags' is passed on to the parent loader.
+        weights = {k: v for k, v in state_dict.items() if k != 'tags'}
+        return super(DeepDanbooruModel, self).load_state_dict(weights, **kwargs)
+
--- a/modules/devices.py
+++ b/modules/devices.py
@ -2,72 +2,95 @@ import sys, os, shlex
 import contextlib
 import torch
 from modules import errors
+from packaging import version

-# has_mps is only available in nightly pytorch (for now), `getattr` for compatibility
-has_mps = getattr(torch, 'has_mps', False)

-cpu = torch.device("cpu")
+# has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
+# check `getattr` and try it for compatibility
+def has_mps() -> bool:
+    """Report whether the MPS backend can actually be used.
+
+    Returns False when ``torch`` exposes no truthy ``has_mps`` attribute, or when
+    moving a small tensor to the ``mps`` device raises; True otherwise.
+    """
+    if getattr(torch, 'has_mps', False):
+        try:
+            # Probe the device instead of trusting the attribute alone.
+            torch.zeros(1).to(torch.device("mps"))
+        except Exception:
+            return False
+        return True
+    return False
+

 def extract_device_id(args, name):
+    """Return the element of ``args`` that follows the first element containing ``name``.
+
+    Returns None when no element of ``args`` contains ``name``.
+    NOTE(review): if the matching element is the last one, ``args[x + 1]`` raises IndexError.
+    """
    for x in range(len(args)):
-        if name in args[x]: return args[x+1]
+        if name in args[x]:
+            return args[x + 1]
+
    return None

+
+def get_cuda_device_string():
+    """Build the CUDA device string, appending the configured device id when one is set."""
+    from modules import shared
+
+    device_id = shared.cmd_opts.device_id
+    return "cuda" if device_id is None else f"cuda:{device_id}"
+
+
 def get_optimal_device():
+    """Pick a torch device: CUDA when available, otherwise MPS when usable, otherwise ``cpu``."""
    if torch.cuda.is_available():
-        from modules import shared
+        return torch.device(get_cuda_device_string())

-        device_id = shared.cmd_opts.device_id
-
-        if device_id is not None:
-            cuda_device = f"cuda:{device_id}"
-            return torch.device(cuda_device)
-        else:
-            return torch.device("cuda")
-
-    if has_mps:
+    if has_mps():
        return torch.device("mps")

    return cpu


+def get_device_for(task):
+    """Return ``cpu`` when ``task`` is listed in ``shared.cmd_opts.use_cpu``, else the optimal device."""
+    from modules import shared
+
+    cpu_only_tasks = shared.cmd_opts.use_cpu
+    return cpu if task in cpu_only_tasks else get_optimal_device()
+
+
 def torch_gc():
+    """Call ``torch.cuda.empty_cache()`` and ``torch.cuda.ipc_collect()`` when CUDA is available.
+
+    Both calls run inside the device context named by ``get_cuda_device_string()``.
+    Does nothing when CUDA is unavailable.
+    """
    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+        with torch.cuda.device(get_cuda_device_string()):
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()


 def enable_tf32():
+    """Turn on TF32 for CUDA matmul and cuDNN when CUDA is available.
+
+    Additionally enables ``cudnn.benchmark`` when any visible CUDA device reports
+    compute capability (7, 5).
+    """
    if torch.cuda.is_available():
+
+        # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
+        # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
+        if any([torch.cuda.get_device_capability(devid) == (7, 5) for devid in range(0, torch.cuda.device_count())]):
+            torch.backends.cudnn.benchmark = True
+
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True


+
 errors.run(enable_tf32, "Enabling TF32")

-device = device_interrogate = device_gfpgan = device_swinir = device_esrgan = device_scunet = device_codeformer = None
+cpu = torch.device("cpu")
+device = device_interrogate = device_gfpgan = device_esrgan = device_codeformer = None
 dtype = torch.float16
 dtype_vae = torch.float16

-def randn(seed, shape):
-    # Pytorch currently doesn't handle setting randomness correctly when the metal backend is used.
-    if device.type == 'mps':
-        generator = torch.Generator(device=cpu)
-        generator.manual_seed(seed)
-        noise = torch.randn(shape, generator=generator, device=cpu).to(device)
-        return noise

+def randn(seed, shape):
+    """Seed torch's global RNG with ``seed`` and return a normal-noise tensor of ``shape`` on ``device``."""
    torch.manual_seed(seed)
+    # On MPS the noise is drawn on the CPU and then moved to the device.
+    if device.type == 'mps':
+        return torch.randn(shape, device=cpu).to(device)
    return torch.randn(shape, device=device)


 def randn_without_seed(shape):
+    """Return a normal-noise tensor of ``shape`` on ``device`` without reseeding the RNG."""
-    # Pytorch currently doesn't handle setting randomness correctly when the metal backend is used.
+    # On MPS the noise is drawn on the CPU and then moved to the device.
    if device.type == 'mps':
-        generator = torch.Generator(device=cpu)
-        noise = torch.randn(shape, generator=generator, device=cpu).to(device)
-        return noise
-
+        return torch.randn(shape, device=cpu).to(device)
    return torch.randn(shape, device=device)


@ -82,6 +105,36 @@ def autocast(disable=False):

    return torch.autocast("cuda")

+
 # MPS workaround for https://github.com/pytorch/pytorch/issues/79383
-def mps_contiguous(input_tensor, device): return input_tensor.contiguous() if device.type == 'mps' else input_tensor
-def mps_contiguous_to(input_tensor, device): return mps_contiguous(input_tensor, device).to(device)
+orig_tensor_to = torch.Tensor.to
+def tensor_to_fix(self, *args, **kwargs):
+    """Wrapper around ``torch.Tensor.to`` that makes the tensor contiguous before a move to MPS.
+
+    The extra ``contiguous()`` call happens only when the tensor is not already on
+    MPS and the target, given as a ``torch.device`` either as the first positional
+    argument or as ``device=``, is an MPS device.  All arguments are then passed
+    unchanged to the original ``to``.
+    """
+    def _is_mps_device(candidate):
+        return isinstance(candidate, torch.device) and candidate.type == 'mps'
+
+    positional_target = args[0] if args else None
+    heading_to_mps = _is_mps_device(positional_target) or _is_mps_device(kwargs.get('device'))
+    if self.device.type != 'mps' and heading_to_mps:
+        self = self.contiguous()
+    return orig_tensor_to(self, *args, **kwargs)
+
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/80800 
+orig_layer_norm = torch.nn.functional.layer_norm
+def layer_norm_fix(*args, **kwargs):
+    """Wrapper around ``layer_norm`` that makes an MPS input tensor contiguous first.
+
+    Only the first positional argument is inspected; everything else is passed
+    through to the original function untouched.
+    """
+    leading = args[0] if args else None
+    if isinstance(leading, torch.Tensor) and leading.device.type == 'mps':
+        args = [leading.contiguous(), *args[1:]]
+    return orig_layer_norm(*args, **kwargs)
+
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/90532
+orig_tensor_numpy = torch.Tensor.numpy
+def numpy_fix(self, *args, **kwargs):
+    """Wrapper around ``torch.Tensor.numpy`` that detaches tensors requiring grad before converting."""
+    source = self.detach() if self.requires_grad else self
+    return orig_tensor_numpy(source, *args, **kwargs)
+
+
+# PyTorch 1.13 doesn't need these fixes but unfortunately is slower and has regressions that prevent training from working
+# The replacements below are installed only when MPS is usable and the installed torch is older than 1.13.
+if has_mps() and version.parse(torch.__version__) < version.parse("1.13"):
+    torch.Tensor.to = tensor_to_fix
+    torch.nn.functional.layer_norm = layer_norm_fix
+    torch.Tensor.numpy = numpy_fix
--- a/modules/errors.py
+++ b/modules/errors.py
@ -2,9 +2,30 @@ import sys
 import traceback


+def print_error_explanation(message):
+    """Print ``message`` to stderr framed above and below by a rule of ``=`` characters.
+
+    The message is stripped first; the rule is as wide as its longest line.
+    """
+    text_lines = message.strip().split("\n")
+    rule = '=' * max(len(text_line) for text_line in text_lines)
+
+    print(rule, file=sys.stderr)
+    for text_line in text_lines:
+        print(text_line, file=sys.stderr)
+    print(rule, file=sys.stderr)
+
+
+def display(e: Exception, task):
+    """Print a report for exception ``e`` to stderr.
+
+    Writes a ``<task>: <exception type>`` header (``error`` is used when ``task`` is
+    falsy) followed by the traceback of ``e``.  When the exception message contains
+    a known checkpoint shape-mismatch text, an extra boxed hint is printed.
+    """
+    print(f"{task or 'error'}: {type(e).__name__}", file=sys.stderr)
+    # Format the traceback attached to `e` itself so the report is correct even when
+    # this is called outside the `except` block that caught it.
+    print("".join(traceback.format_exception(type(e), e, e.__traceback__)), file=sys.stderr)
+
+    message = str(e)
+    if "copying a param with shape torch.Size([640, 1024]) from checkpoint, the shape in current model is torch.Size([640, 768])" in message:
+        print_error_explanation("""
+The most likely cause of this is you are trying to load Stable Diffusion 2.0 model without specifying its config file.
+See https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#stable-diffusion-20 for how to solve this.
+        """)
+
+
 def run(code, task):
+    """Call ``code()`` and report any ``Exception`` it raises through ``display`` instead of propagating it.
+
+    ``task`` is the label used in the report header.
+    """
    try:
        code()
    except Exception as e:
-        print(f"{task}: {type(e).__name__}", file=sys.stderr)
-        print(traceback.format_exc(), file=sys.stderr)
+        # display() takes the exception first and the task label second.
+        display(e, task)
--- a/modules/esrgan_model.py
+++ b/modules/esrgan_model.py
@ -199,7 +199,7 @@ def upscale_without_tiling(model, img):
    img = img[:, :, ::-1]
    img = np.ascontiguousarray(np.transpose(img, (2, 0, 1))) / 255
    img = torch.from_numpy(img).float()
-    img = devices.mps_contiguous_to(img.unsqueeze(0), devices.device_esrgan)
+    img = img.unsqueeze(0).to(devices.device_esrgan)
    with torch.no_grad():
        output = model(img)
    output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
--- a/modules/extensions.py
+++ b/modules/extensions.py
@ -6,9 +6,9 @@ import git

 from modules import paths, shared

-
 extensions = []
 extensions_dir = os.path.join(paths.script_path, "extensions")
+extensions_builtin_dir = os.path.join(paths.script_path, "extensions-builtin")


 def active():
@ -16,12 +16,13 @@ def active():


 class Extension:
-    def __init__(self, name, path, enabled=True):
+    def __init__(self, name, path, enabled=True, is_builtin=False):
        self.name = name
        self.path = path
        self.enabled = enabled
        self.status = ''
        self.can_update = False
+        self.is_builtin = is_builtin

        repo = None
        try:
@ -66,9 +67,12 @@ class Extension:
        self.can_update = False
        self.status = "latest"

-    def pull(self):
+    def fetch_and_reset_hard(self):
+        """Fetch all remotes of the repository at ``self.path`` and hard-reset it to ``origin``.
+
+        NOTE(review): ``reset --hard`` discards local modifications to tracked files.
+        """
        repo = git.Repo(self.path)
-        repo.remotes.origin.pull()
+        # Fix: `error: Your local changes to the following files would be overwritten by merge`.
+        # WSL2 Docker sets 755 file permissions instead of 644, which results in that error.
+        repo.git.fetch('--all')
+        repo.git.reset('--hard', 'origin')


 def list_extensions():
@ -77,10 +81,19 @@ def list_extensions():
    if not os.path.isdir(extensions_dir):
        return

-    for dirname in sorted(os.listdir(extensions_dir)):
-        path = os.path.join(extensions_dir, dirname)
-        if not os.path.isdir(path):
-            continue
+    paths = []
+    for dirname in [extensions_dir, extensions_builtin_dir]:
+        if not os.path.isdir(dirname):
+            return

-        extension = Extension(name=dirname, path=path, enabled=dirname not in shared.opts.disabled_extensions)
+        for extension_dirname in sorted(os.listdir(dirname)):
+            path = os.path.join(dirname, extension_dirname)
+            if not os.path.isdir(path):
+                continue
+
+            paths.append((extension_dirname, path, dirname == extensions_builtin_dir))
+
+    for dirname, path, is_builtin in paths:
+        extension = Extension(name=dirname, path=path, enabled=dirname not in shared.opts.disabled_extensions, is_builtin=is_builtin)
        extensions.append(extension)
+
--- a/modules/extras.py
+++ b/modules/extras.py
@ -1,6 +1,8 @@
 from __future__ import annotations
 import math
 import os
+import sys
+import traceback

 import numpy as np
 from PIL import Image
@ -12,15 +14,13 @@ from typing import Callable, List, OrderedDict, Tuple
 from functools import partial
 from dataclasses import dataclass

-from modules import processing, shared, images, devices, sd_models
+from modules import processing, shared, images, devices, sd_models, sd_samplers
 from modules.shared import opts
 import modules.gfpgan_model
 from modules.ui import plaintext_to_html
 import modules.codeformer_model
-import piexif
-import piexif.helper
 import gradio as gr
-
+import safetensors.torch

 class LruCache(OrderedDict):
    @dataclass(frozen=True)
@ -53,14 +53,17 @@ class LruCache(OrderedDict):
 cached_images: LruCache = LruCache(max_size=5)


-def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_dir, show_extras_results, gfpgan_visibility, codeformer_visibility, codeformer_weight, upscaling_resize, upscaling_resize_w, upscaling_resize_h, upscaling_crop, extras_upscaler_1, extras_upscaler_2, extras_upscaler_2_visibility, upscale_first: bool):
+def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_dir, show_extras_results, gfpgan_visibility, codeformer_visibility, codeformer_weight, upscaling_resize, upscaling_resize_w, upscaling_resize_h, upscaling_crop, extras_upscaler_1, extras_upscaler_2, extras_upscaler_2_visibility, upscale_first: bool, save_output: bool = True):
    devices.torch_gc()

+    shared.state.begin()
+    shared.state.job = 'extras'
+
    imageArr = []
    # Also keep track of original file names
    imageNameArr = []
    outputs = []
-    
+
    if extras_mode == 1:
        #convert file to pillow image
        for img in image_folder:
@ -92,6 +95,7 @@ def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_
    # Extra operation definitions

    def run_gfpgan(image: Image.Image, info: str) -> Tuple[Image.Image, str]:
+        shared.state.job = 'extras-gfpgan'
        restored_img = modules.gfpgan_model.gfpgan_fix_faces(np.array(image, dtype=np.uint8))
        res = Image.fromarray(restored_img)

@ -102,6 +106,7 @@ def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_
        return (res, info)

    def run_codeformer(image: Image.Image, info: str) -> Tuple[Image.Image, str]:
+        shared.state.job = 'extras-codeformer'
        restored_img = modules.codeformer_model.codeformer.restore(np.array(image, dtype=np.uint8), w=codeformer_weight)
        res = Image.fromarray(restored_img)

@ -112,6 +117,7 @@ def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_
        return (res, info)

    def upscale(image, scaler_index, resize, mode, resize_w, resize_h, crop):
+        shared.state.job = 'extras-upscale'
        upscaler = shared.sd_upscalers[scaler_index]
        res = upscaler.scaler.upscale(image, resize, upscaler.data_path)
        if mode == 1 and crop:
@ -178,6 +184,9 @@ def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_
    for image, image_name in zip(imageArr, imageNameArr):
        if image is None:
            return outputs, "Please select an input image.", ''
+
+        shared.state.textinfo = f'Processing image {image_name}'
+        
        existing_pnginfo = image.info or {}

        image = image.convert("RGB")
@ -186,18 +195,25 @@ def run_extras(extras_mode, resize_mode, image, image_folder, input_dir, output_
        for op in extras_ops:
            image, info = op(image, info)

-        if opts.use_original_name_batch and image_name != None:
+        if opts.use_original_name_batch and image_name is not None:
            basename = os.path.splitext(os.path.basename(image_name))[0]
        else:
            basename = ''

-        images.save_image(image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,
-                          no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=existing_pnginfo, forced_filename=None)
-
-        if opts.enable_pnginfo:
+        if opts.enable_pnginfo: # append info before save
            image.info = existing_pnginfo
            image.info["extras"] = info

+        if save_output:
+            # Add upscaler name as a suffix.
+            suffix = f"-{shared.sd_upscalers[extras_upscaler_1].name}" if shared.opts.use_upscaler_name_as_suffix else ""
+            # Add second upscaler if applicable.
+            if suffix and extras_upscaler_2 and extras_upscaler_2_visibility:
+                suffix += f"-{shared.sd_upscalers[extras_upscaler_2].name}"
+
+            images.save_image(image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,
+                            no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=existing_pnginfo, forced_filename=None, suffix=suffix)
+
        if extras_mode != 2 or show_extras_results :
            outputs.append(image)

@ -213,25 +229,8 @@ def run_pnginfo(image):
    if image is None:
        return '', '', ''

-    items = image.info
-    geninfo = ''
-
-    if "exif" in image.info:
-        exif = piexif.load(image.info["exif"])
-        exif_comment = (exif or {}).get("Exif", {}).get(piexif.ExifIFD.UserComment, b'')
-        try:
-            exif_comment = piexif.helper.UserComment.load(exif_comment)
-        except ValueError:
-            exif_comment = exif_comment.decode('utf8', errors="ignore")
-
-        items['exif comment'] = exif_comment
-        geninfo = exif_comment
-
-        for field in ['jfif', 'jfif_version', 'jfif_unit', 'jfif_density', 'dpi', 'exif',
-                      'loop', 'background', 'timestamp', 'duration']:
-            items.pop(field, None)
-
-    geninfo = items.get('parameters', geninfo)
+    geninfo, items = images.read_info_from_image(image)
+    items = {**{'parameters': geninfo}, **items}

    info = ''
    for key, text in items.items():
@ -249,7 +248,10 @@ def run_pnginfo(image):
    return '', geninfo, info


-def run_modelmerger(primary_model_name, secondary_model_name, teritary_model_name, interp_method, multiplier, save_as_half, custom_name):
+def run_modelmerger(primary_model_name, secondary_model_name, tertiary_model_name, interp_method, multiplier, save_as_half, custom_name, checkpoint_format):
+    shared.state.begin()
+    shared.state.job = 'model-merge'
+
    def weighted_sum(theta0, theta1, alpha):
        return ((1 - alpha) * theta0) + (alpha * theta1)

@ -261,23 +263,8 @@ def run_modelmerger(primary_model_name, secondary_model_name, teritary_model_nam

    primary_model_info = sd_models.checkpoints_list[primary_model_name]
    secondary_model_info = sd_models.checkpoints_list[secondary_model_name]
-    teritary_model_info = sd_models.checkpoints_list.get(teritary_model_name, None)
-
-    print(f"Loading {primary_model_info.filename}...")
-    primary_model = torch.load(primary_model_info.filename, map_location='cpu')
-    theta_0 = sd_models.get_state_dict_from_checkpoint(primary_model)
-
-    print(f"Loading {secondary_model_info.filename}...")
-    secondary_model = torch.load(secondary_model_info.filename, map_location='cpu')
-    theta_1 = sd_models.get_state_dict_from_checkpoint(secondary_model)
-
-    if teritary_model_info is not None:
-        print(f"Loading {teritary_model_info.filename}...")
-        teritary_model = torch.load(teritary_model_info.filename, map_location='cpu')
-        theta_2 = sd_models.get_state_dict_from_checkpoint(teritary_model)
-    else:
-        teritary_model = None
-        theta_2 = None
+    tertiary_model_info = sd_models.checkpoints_list.get(tertiary_model_name, None)
+    result_is_inpainting_model = False

    theta_funcs = {
        "Weighted sum": (None, weighted_sum),
@ -285,9 +272,19 @@ def run_modelmerger(primary_model_name, secondary_model_name, teritary_model_nam
    }
    theta_func1, theta_func2 = theta_funcs[interp_method]

-    print(f"Merging...")
+    if theta_func1 and not tertiary_model_info:
+        shared.state.textinfo = "Failed: Interpolation method requires a tertiary model."
+        shared.state.end()
+        return ["Failed: Interpolation method requires a tertiary model."] + [gr.Dropdown.update(choices=sd_models.checkpoint_tiles()) for _ in range(4)]
+
+    shared.state.textinfo = f"Loading {secondary_model_info.filename}..."
+    print(f"Loading {secondary_model_info.filename}...")
+    theta_1 = sd_models.read_state_dict(secondary_model_info.filename, map_location='cpu')

    if theta_func1:
+        print(f"Loading {tertiary_model_info.filename}...")
+        theta_2 = sd_models.read_state_dict(tertiary_model_info.filename, map_location='cpu')
+
        for key in tqdm.tqdm(theta_1.keys()):
            if 'model' in key:
                if key in theta_2:
@ -295,12 +292,33 @@ def run_modelmerger(primary_model_name, secondary_model_name, teritary_model_nam
                    theta_1[key] = theta_func1(theta_1[key], t2)
                else:
                    theta_1[key] = torch.zeros_like(theta_1[key])
-    del theta_2, teritary_model
+        del theta_2
+
+    shared.state.textinfo = f"Loading {primary_model_info.filename}..."
+    print(f"Loading {primary_model_info.filename}...")
+    theta_0 = sd_models.read_state_dict(primary_model_info.filename, map_location='cpu')
+
+    print("Merging...")

    for key in tqdm.tqdm(theta_0.keys()):
        if 'model' in key and key in theta_1:
+            a = theta_0[key]
+            b = theta_1[key]

-            theta_0[key] = theta_func2(theta_0[key], theta_1[key], multiplier)
+            shared.state.textinfo = f'Merging layer {key}'
+            # this enables merging an inpainting model (A) with another one (B);
+            # where a normal model would have 4 channels for latent space, an inpainting model would
+            # have another 4 channels for the unmasked picture's latent space, plus one channel for the mask, for a total of 9
+            if a.shape != b.shape and a.shape[0:1] + a.shape[2:] == b.shape[0:1] + b.shape[2:]:
+                if a.shape[1] == 4 and b.shape[1] == 9:
+                    raise RuntimeError("When merging inpainting model with a normal one, A must be the inpainting model.")
+
+                assert a.shape[1] == 9 and b.shape[1] == 4, f"Bad dimensions for merged layer {key}: A={a.shape}, B={b.shape}"
+
+                theta_0[key][:, 0:4, :, :] = theta_func2(a[:, 0:4, :, :], b, multiplier)
+                result_is_inpainting_model = True
+            else:
+                theta_0[key] = theta_func2(a, b, multiplier)

            if save_as_half:
                theta_0[key] = theta_0[key].half()
@ -311,17 +329,35 @@ def run_modelmerger(primary_model_name, secondary_model_name, teritary_model_nam
            theta_0[key] = theta_1[key]
            if save_as_half:
                theta_0[key] = theta_0[key].half()
+    del theta_1

    ckpt_dir = shared.cmd_opts.ckpt_dir or sd_models.model_path

-    filename = primary_model_info.model_name + '_' + str(round(1-multiplier, 2)) + '-' + secondary_model_info.model_name + '_' + str(round(multiplier, 2)) + '-' + interp_method.replace(" ", "_") + '-merged.ckpt'
-    filename = filename if custom_name == '' else (custom_name + '.ckpt')
+    filename = \
+        primary_model_info.model_name + '_' + str(round(1-multiplier, 2)) + '-' + \
+        secondary_model_info.model_name + '_' + str(round(multiplier, 2)) + '-' + \
+        interp_method.replace(" ", "_") + \
+        '-merged.' +  \
+        ("inpainting." if result_is_inpainting_model else "") + \
+        checkpoint_format
+
+    filename = filename if custom_name == '' else (custom_name + '.' + checkpoint_format)
+
    output_modelname = os.path.join(ckpt_dir, filename)

+    shared.state.textinfo = f"Saving to {output_modelname}..."
    print(f"Saving to {output_modelname}...")
-    torch.save(primary_model, output_modelname)
+
+    _, extension = os.path.splitext(output_modelname)
+    if extension.lower() == ".safetensors":
+        safetensors.torch.save_file(theta_0, output_modelname, metadata={"format": "pt"})
+    else:
+        torch.save(theta_0, output_modelname)

    sd_models.list_models()

-    print(f"Checkpoint saved.")
+    print("Checkpoint saved.")
+    shared.state.textinfo = "Checkpoint saved to " + output_modelname
+    shared.state.end()
+
    return ["Checkpoint saved to " + output_modelname] + [gr.Dropdown.update(choices=sd_models.checkpoint_tiles()) for _ in range(4)]
--- a/modules/generation_parameters_copypaste.py
+++ b/modules/generation_parameters_copypaste.py
@ -1,10 +1,13 @@
 import base64
 import io
+import math
 import os
 import re
+from pathlib import Path
+
 import gradio as gr
 from modules.shared import script_path
-from modules import shared
+from modules import shared, ui_tempdir
 import tempfile
 from PIL import Image

@ -12,6 +15,7 @@ re_param_code = r'\s*([\w ]+):\s*("(?:\\|\"|[^\"])+"|[^,]*)(?:,|$)'
 re_param = re.compile(re_param_code)
 re_params = re.compile(r"^(?:" + re_param_code + "){3,}$")
 re_imagesize = re.compile(r"^(\d+)x(\d+)$")
+re_hypernet_hash = re.compile("\(([0-9a-f]+)\)$")
 type_of_gr_update = type(gr.update())
 paste_fields = {}
 bind_list = []
@ -33,11 +37,13 @@ def quote(text):


 def image_from_url_text(filedata):
-    if type(filedata) == dict and filedata["is_file"]:
+    if type(filedata) == list and len(filedata) > 0 and type(filedata[0]) == dict and filedata[0].get("is_file", False):
+        filedata = filedata[0]
+
+    if type(filedata) == dict and filedata.get("is_file", False):
        filename = filedata["name"]
-        tempdir = os.path.normpath(tempfile.gettempdir())
-        normfn = os.path.normpath(filename)
-        assert normfn.startswith(tempdir), 'trying to open image file not in temporary directory'
+        is_in_right_dir = ui_tempdir.check_tmp_file(shared.demo, filename)
+        assert is_in_right_dir, 'trying to open image file outside of allowed directories'

        return Image.open(filename)

@ -73,7 +79,10 @@ def integrate_settings_paste_fields(component_dict):
        'sd_hypernetwork': 'Hypernet',
        'sd_hypernetwork_strength': 'Hypernet strength',
        'CLIP_stop_at_last_layers': 'Clip skip',
+        'inpainting_mask_weight': 'Conditional mask weight',
        'sd_model_checkpoint': 'Model hash',
+        'eta_noise_seed_delta': 'ENSD',
+        'initial_noise_multiplier': 'Noise multiplier',
    }
    settings_paste_fields = [
        (component_dict[k], lambda d, k=k, v=v: ui.apply_setting(k, d.get(v, None)))
@ -88,7 +97,7 @@ def integrate_settings_paste_fields(component_dict):
 def create_buttons(tabs_list):
    buttons = {}
    for tab in tabs_list:
-        buttons[tab] = gr.Button(f"Send to {tab}")
+        buttons[tab] = gr.Button(f"Send to {tab}", elem_id=f"{tab}_tab")
    return buttons


@ -97,36 +106,57 @@ def bind_buttons(buttons, send_image, send_generate_info):
    bind_list.append([buttons, send_image, send_generate_info])


+def send_image_and_dimensions(x):
+    if isinstance(x, Image.Image):
+        img = x
+    else:
+        img = image_from_url_text(x)
+
+    if shared.opts.send_size and isinstance(img, Image.Image):
+        w = img.width
+        h = img.height
+    else:
+        w = gr.update()
+        h = gr.update()
+
+    return img, w, h
+
+
 def run_bind():
-    for buttons, send_image, send_generate_info in bind_list:
+    for buttons, source_image_component, send_generate_info in bind_list:
        for tab in buttons:
            button = buttons[tab]
-            if send_image and paste_fields[tab]["init_img"]:
-                if type(send_image) == gr.Gallery:
-                    button.click(
-                        fn=lambda x: image_from_url_text(x),
-                        _js="extract_image_from_gallery",
-                        inputs=[send_image],
-                        outputs=[paste_fields[tab]["init_img"]],
-                    )
+            destination_image_component = paste_fields[tab]["init_img"]
+            fields = paste_fields[tab]["fields"]
+
+            destination_width_component = next(iter([field for field, name in fields if name == "Size-1"] if fields else []), None)
+            destination_height_component = next(iter([field for field, name in fields if name == "Size-2"] if fields else []), None)
+
+            if source_image_component and destination_image_component:
+                if isinstance(source_image_component, gr.Gallery):
+                    func = send_image_and_dimensions if destination_width_component else image_from_url_text
+                    jsfunc = "extract_image_from_gallery"
                else:
-                    button.click(
-                        fn=lambda x: x,
-                        inputs=[send_image],
-                        outputs=[paste_fields[tab]["init_img"]],
-                    )
+                    func = send_image_and_dimensions if destination_width_component else lambda x: x
+                    jsfunc = None

-            if send_generate_info and paste_fields[tab]["fields"] is not None:
+                button.click(
+                    fn=func,
+                    _js=jsfunc,
+                    inputs=[source_image_component],
+                    outputs=[destination_image_component, destination_width_component, destination_height_component] if destination_width_component else [destination_image_component],
+                )
+
+            if send_generate_info and fields is not None:
                if send_generate_info in paste_fields:
-                    paste_field_names = ['Prompt', 'Negative prompt', 'Steps', 'Face restoration', 'Size-1', 'Size-2'] + (["Seed"] if shared.opts.send_seed else [])
-
+                    paste_field_names = ['Prompt', 'Negative prompt', 'Steps', 'Face restoration'] + (["Seed"] if shared.opts.send_seed else [])
                    button.click(
                        fn=lambda *x: x,
                        inputs=[field for field, name in paste_fields[send_generate_info]["fields"] if name in paste_field_names],
-                        outputs=[field for field, name in paste_fields[tab]["fields"] if name in paste_field_names],
+                        outputs=[field for field, name in fields if name in paste_field_names],
                    )
                else:
-                    connect_paste(button, paste_fields[tab]["fields"], send_generate_info)
+                    connect_paste(button, fields, send_generate_info)

            button.click(
                fn=None,
@ -136,6 +166,59 @@ def run_bind():
            )


+def find_hypernetwork_key(hypernet_name, hypernet_hash=None):
+    """Determines the config parameter name to use for the hypernet based on the parameters in the infotext.
+
+    Example: an infotext provides "Hypernet: ke-ta" and "Hypernet hash: 1234abcd". For the "Hypernet" config
+    parameter this means there should be an entry that looks like "ke-ta-10000(1234abcd)" to set it to.
+
+    If the infotext has no hash, then a hypernet with the same name will be selected instead.
+    """
+    hypernet_name = hypernet_name.lower()
+    if hypernet_hash is not None:
+        # Try to match the hash in the name
+        for hypernet_key in shared.hypernetworks.keys():
+            result = re_hypernet_hash.search(hypernet_key)
+            if result is not None and result[1] == hypernet_hash:
+                return hypernet_key
+    else:
+        # Fall back to a hypernet with the same name
+        for hypernet_key in shared.hypernetworks.keys():
+            if hypernet_key.lower().startswith(hypernet_name):
+                return hypernet_key
+
+    return None
+
+
+def restore_old_hires_fix_params(res):
+    """for infotexts that specify old First pass size parameter, convert it into
+    width, height, and hr scale"""
+
+    firstpass_width = res.get('First pass size-1', None)
+    firstpass_height = res.get('First pass size-2', None)
+
+    if firstpass_width is None or firstpass_height is None:
+        return
+
+    firstpass_width, firstpass_height = int(firstpass_width), int(firstpass_height)
+    width = int(res.get("Size-1", 512))
+    height = int(res.get("Size-2", 512))
+
+    if firstpass_width == 0 or firstpass_height == 0:
+        # old algorithm for auto-calculating first pass size
+        desired_pixel_count = 512 * 512
+        actual_pixel_count = width * height
+        scale = math.sqrt(desired_pixel_count / actual_pixel_count)
+        firstpass_width = math.ceil(scale * width / 64) * 64
+        firstpass_height = math.ceil(scale * height / 64) * 64
+
+    hr_scale = width / firstpass_width if firstpass_width > 0 else height / firstpass_height
+
+    res['Size-1'] = firstpass_width
+    res['Size-2'] = firstpass_height
+    res['Hires upscale'] = hr_scale
+
+
 def parse_generation_parameters(x: str):
    """parses generation parameters string, the one you see in text field under the picture in UI:
 ```
@ -181,6 +264,20 @@ Steps: 20, Sampler: Euler a, CFG scale: 7, Seed: 965400086, Size: 512x512, Model
        else:
            res[k] = v

+    # Missing CLIP skip means it was set to 1 (the default)
+    if "Clip skip" not in res:
+        res["Clip skip"] = "1"
+
+    if "Hypernet strength" not in res:
+        res["Hypernet strength"] = "1"
+
+    if "Hypernet" in res:
+        hypernet_name = res["Hypernet"]
+        hypernet_hash = res.get("Hypernet hash", None)
+        res["Hypernet"] = find_hypernetwork_key(hypernet_name, hypernet_hash)
+
+    restore_old_hires_fix_params(res)
+
    return res


--- a/modules/gfpgan_model.py
+++ b/modules/gfpgan_model.py
@ -36,7 +36,9 @@ def gfpgann():
    else:
        print("Unable to load gfpgan model!")
        return None
-    model = gfpgan_constructor(model_path=model_file, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None)
+    if hasattr(facexlib.detection.retinaface, 'device'):
+        facexlib.detection.retinaface.device = devices.device_gfpgan
+    model = gfpgan_constructor(model_path=model_file, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=devices.device_gfpgan)
    loaded_gfpgan_model = model

    return model
--- a/modules/hypernetworks/hypernetwork.py
+++ b/modules/hypernetworks/hypernetwork.py
@ -12,7 +12,7 @@ import torch
 import tqdm
 from einops import rearrange, repeat
 from ldm.util import default
-from modules import devices, processing, sd_models, shared
+from modules import devices, processing, sd_models, shared, sd_samplers
 from modules.textual_inversion import textual_inversion
 from modules.textual_inversion.learn_schedule import LearnRateScheduler
 from torch import einsum
@ -38,7 +38,7 @@ class HypernetworkModule(torch.nn.Module):
    activation_dict.update({cls_name.lower(): cls_obj for cls_name, cls_obj in inspect.getmembers(torch.nn.modules.activation) if inspect.isclass(cls_obj) and cls_obj.__module__ == 'torch.nn.modules.activation'})

    def __init__(self, dim, state_dict=None, layer_structure=None, activation_func=None, weight_init='Normal',
-                 add_layer_norm=False, use_dropout=False, activate_output=False, last_layer_dropout=True):
+                 add_layer_norm=False, use_dropout=False, activate_output=False, last_layer_dropout=False):
        super().__init__()

        assert layer_structure is not None, "layer_structure must not be None"
@ -154,16 +154,28 @@ class Hypernetwork:
                HypernetworkModule(size, None, self.layer_structure, self.activation_func, self.weight_init,
                                   self.add_layer_norm, self.use_dropout, self.activate_output, last_layer_dropout=self.last_layer_dropout),
            )
+        self.eval_mode()

    def weights(self):
        res = []
+        for k, layers in self.layers.items():
+            for layer in layers:
+                res += layer.parameters()
+        return res

+    def train_mode(self):
        for k, layers in self.layers.items():
            for layer in layers:
                layer.train()
-                res += layer.trainables()
+                for param in layer.parameters():
+                    param.requires_grad = True

-        return res
+    def eval_mode(self):
+        for k, layers in self.layers.items():
+            for layer in layers:
+                layer.eval()
+                for param in layer.parameters():
+                    param.requires_grad = False

    def save(self, filename):
        state_dict = {}
@ -265,7 +277,7 @@ def load_hypernetwork(filename):
            print(traceback.format_exc(), file=sys.stderr)
    else:
        if shared.loaded_hypernetwork is not None:
-            print(f"Unloading hypernetwork")
+            print("Unloading hypernetwork")

        shared.loaded_hypernetwork = None

@ -366,19 +378,44 @@ def report_statistics(loss_info:dict):
            print(e)


+def create_hypernetwork(name, enable_sizes, overwrite_old, layer_structure=None, activation_func=None, weight_init=None, add_layer_norm=False, use_dropout=False):
+    # Remove illegal characters from name.
+    name = "".join( x for x in name if (x.isalnum() or x in "._- "))

-def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log_directory, training_width, training_height, steps, clip_grad_mode, clip_grad_value, create_image_every, save_hypernetwork_every, template_file, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height):
+    fn = os.path.join(shared.cmd_opts.hypernetwork_dir, f"{name}.pt")
+    if not overwrite_old:
+        assert not os.path.exists(fn), f"file {fn} already exists"
+
+    if type(layer_structure) == str:
+        layer_structure = [float(x.strip()) for x in layer_structure.split(",")]
+
+    hypernet = modules.hypernetworks.hypernetwork.Hypernetwork(
+        name=name,
+        enable_sizes=[int(x) for x in enable_sizes],
+        layer_structure=layer_structure,
+        activation_func=activation_func,
+        weight_init=weight_init,
+        add_layer_norm=add_layer_norm,
+        use_dropout=use_dropout,
+    )
+    hypernet.save(fn)
+
+    shared.reload_hypernetworks()
+
+
+def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width, training_height, steps, clip_grad_mode, clip_grad_value, shuffle_tags, tag_drop_out, latent_sampling_method, create_image_every, save_hypernetwork_every, template_file, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height):
    # images allows training previews to have infotext. Importing it at the top causes a circular import problem.
    from modules import images

    save_hypernetwork_every = save_hypernetwork_every or 0
    create_image_every = create_image_every or 0
-    textual_inversion.validate_train_inputs(hypernetwork_name, learn_rate, batch_size, data_root, template_file, steps, save_hypernetwork_every, create_image_every, log_directory, name="hypernetwork")
+    textual_inversion.validate_train_inputs(hypernetwork_name, learn_rate, batch_size, gradient_step, data_root, template_file, steps, save_hypernetwork_every, create_image_every, log_directory, name="hypernetwork")

    path = shared.hypernetworks.get(hypernetwork_name, None)
    shared.loaded_hypernetwork = Hypernetwork()
    shared.loaded_hypernetwork.load(path)

+    shared.state.job = "train-hypernetwork"
    shared.state.textinfo = "Initializing hypernetwork training..."
    shared.state.job_count = steps

@ -403,38 +440,37 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
    hypernetwork = shared.loaded_hypernetwork
    checkpoint = sd_models.select_checkpoint()

-    ititial_step = hypernetwork.step or 0
-    if ititial_step >= steps:
-        shared.state.textinfo = f"Model has already been trained beyond specified max steps"
+    initial_step = hypernetwork.step or 0
+    if initial_step >= steps:
+        shared.state.textinfo = "Model has already been trained beyond specified max steps"
        return hypernetwork, filename

-    scheduler = LearnRateScheduler(learn_rate, steps, ititial_step)
+    scheduler = LearnRateScheduler(learn_rate, steps, initial_step)
    
-    clip_grad = torch.nn.utils.clip_grad_value_ if clip_grad_mode == "value" else \
-        torch.nn.utils.clip_grad_norm_ if clip_grad_mode == "norm" else \
-        None
+    clip_grad = torch.nn.utils.clip_grad_value_ if clip_grad_mode == "value" else torch.nn.utils.clip_grad_norm_ if clip_grad_mode == "norm" else None
    if clip_grad:
-        clip_grad_sched = LearnRateScheduler(clip_grad_value, steps, ititial_step, verbose=False)
-    
+        clip_grad_sched = LearnRateScheduler(clip_grad_value, steps, initial_step, verbose=False)
+
    # dataset loading may take a while, so input validations and early returns should be done before this
    shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
-    with torch.autocast("cuda"):
-        ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=hypernetwork_name, model=shared.sd_model, device=devices.device, template_file=template_file, include_cond=True, batch_size=batch_size)
+
+    pin_memory = shared.opts.pin_memory
+
+    ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=hypernetwork_name, model=shared.sd_model, cond_model=shared.sd_model.cond_stage_model, device=devices.device, template_file=template_file, include_cond=True, batch_size=batch_size, gradient_step=gradient_step, shuffle_tags=shuffle_tags, tag_drop_out=tag_drop_out, latent_sampling_method=latent_sampling_method)
+    
+    latent_sampling_method = ds.latent_sampling_method
+
+    dl = modules.textual_inversion.dataset.PersonalizedDataLoader(ds, latent_sampling_method=latent_sampling_method, batch_size=ds.batch_size, pin_memory=pin_memory)
+
+    old_parallel_processing_allowed = shared.parallel_processing_allowed

    if unload:
+        shared.parallel_processing_allowed = False
        shared.sd_model.cond_stage_model.to(devices.cpu)
        shared.sd_model.first_stage_model.to(devices.cpu)

-    size = len(ds.indexes)
-    loss_dict = defaultdict(lambda : deque(maxlen = 1024))
-    losses = torch.zeros((size,))
-    previous_mean_losses = [0]
-    previous_mean_loss = 0
-    print("Mean loss of {} elements".format(size))
-
    weights = hypernetwork.weights()
-    for weight in weights:
-        weight.requires_grad = True
+    hypernetwork.train_mode()

    # Here we use optimizer from saved HN, or we can specify as UI option.
    if hypernetwork.optimizer_name in optimizer_dict:
@ -452,145 +488,172 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
            print("Cannot resume from saved optimizer!")
            print(e)

+    scaler = torch.cuda.amp.GradScaler()
+    
+    batch_size = ds.batch_size
+    gradient_step = ds.gradient_step
+    # each optimizer step consumes batch_size * gradient_step images (gradient accumulation)
+    steps_per_epoch = len(ds) // batch_size // gradient_step
+    max_steps_per_epoch = len(ds) // batch_size - (len(ds) // batch_size) % gradient_step
+    loss_step = 0
+    _loss_step = 0 #internal
+    # size = len(ds.indexes)
+    # loss_dict = defaultdict(lambda : deque(maxlen = 1024))
+    # losses = torch.zeros((size,))
+    # previous_mean_losses = [0]
+    # previous_mean_loss = 0
+    # print("Mean loss of {} elements".format(size))
+
    steps_without_grad = 0

    last_saved_file = "<none>"
    last_saved_image = "<none>"
    forced_filename = "<none>"

-    pbar = tqdm.tqdm(enumerate(ds), total=steps - ititial_step)
-    for i, entries in pbar:
-        hypernetwork.step = i + ititial_step
-        if len(loss_dict) > 0:
-            previous_mean_losses = [i[-1] for i in loss_dict.values()]
-            previous_mean_loss = mean(previous_mean_losses)
-            
-        scheduler.apply(optimizer, hypernetwork.step)
-        if scheduler.finished:
-            break
+    pbar = tqdm.tqdm(total=steps - initial_step)
+    try:
+        for i in range((steps-initial_step) * gradient_step):
+            if scheduler.finished:
+                break
+            if shared.state.interrupted:
+                break
+            for j, batch in enumerate(dl):
+                # works as a drop_last=True for gradient accumulation
+                if j == max_steps_per_epoch:
+                    break
+                scheduler.apply(optimizer, hypernetwork.step)
+                if scheduler.finished:
+                    break
+                if shared.state.interrupted:
+                    break

-        if shared.state.interrupted:
-            break
-
-        if clip_grad:
-            clip_grad_sched.step(hypernetwork.step)
-
-        with torch.autocast("cuda"):
-            c = stack_conds([entry.cond for entry in entries]).to(devices.device)
-            # c = torch.vstack([entry.cond for entry in entries]).to(devices.device)
-            x = torch.stack([entry.latent for entry in entries]).to(devices.device)
-            loss = shared.sd_model(x, c)[0]
-            del x
-            del c
-
-            losses[hypernetwork.step % losses.shape[0]] = loss.item()
-            for entry in entries:
-                loss_dict[entry.filename].append(loss.item())
+                if clip_grad:
+                    clip_grad_sched.step(hypernetwork.step)
                
-            optimizer.zero_grad()
-            weights[0].grad = None
-            loss.backward()
+                with devices.autocast():
+                    x = batch.latent_sample.to(devices.device, non_blocking=pin_memory)
+                    if tag_drop_out != 0 or shuffle_tags:
+                        shared.sd_model.cond_stage_model.to(devices.device)
+                        c = shared.sd_model.cond_stage_model(batch.cond_text).to(devices.device, non_blocking=pin_memory)
+                        shared.sd_model.cond_stage_model.to(devices.cpu)
+                    else:
+                        c = stack_conds(batch.cond).to(devices.device, non_blocking=pin_memory)
+                    loss = shared.sd_model(x, c)[0] / gradient_step
+                    del x
+                    del c

-            if weights[0].grad is None:
-                steps_without_grad += 1
-            else:
-                steps_without_grad = 0
-            assert steps_without_grad < 10, 'no gradient found for the trained weight after backward() for 10 steps in a row; this is a bug; training cannot continue'
+                    _loss_step += loss.item()
+                scaler.scale(loss).backward()
+                
+                # go back until we reach gradient accumulation steps
+                if (j + 1) % gradient_step != 0:
+                    continue

-            if clip_grad:
-                clip_grad(weights, clip_grad_sched.learn_rate)
+                if clip_grad:
+                    clip_grad(weights, clip_grad_sched.learn_rate)
+                
+                scaler.step(optimizer)
+                scaler.update()
+                hypernetwork.step += 1
+                pbar.update()
+                optimizer.zero_grad(set_to_none=True)
+                loss_step = _loss_step
+                _loss_step = 0

-            optimizer.step()
+                steps_done = hypernetwork.step + 1
+                
+                epoch_num = hypernetwork.step // steps_per_epoch
+                epoch_step = hypernetwork.step % steps_per_epoch

-        steps_done = hypernetwork.step + 1
+                pbar.set_description(f"[Epoch {epoch_num}: {epoch_step+1}/{steps_per_epoch}]loss: {loss_step:.7f}")
+                if hypernetwork_dir is not None and steps_done % save_hypernetwork_every == 0:
+                    # Before saving, change name to match current checkpoint.
+                    hypernetwork_name_every = f'{hypernetwork_name}-{steps_done}'
+                    last_saved_file = os.path.join(hypernetwork_dir, f'{hypernetwork_name_every}.pt')
+                    hypernetwork.optimizer_name = optimizer_name
+                    if shared.opts.save_optimizer_state:
+                        hypernetwork.optimizer_state_dict = optimizer.state_dict()
+                    save_hypernetwork(hypernetwork, checkpoint, hypernetwork_name, last_saved_file)
+                    hypernetwork.optimizer_state_dict = None  # dereference it after saving, to save memory.

-        if torch.isnan(losses[hypernetwork.step % losses.shape[0]]): 
-            raise RuntimeError("Loss diverged.")
-        
-        if len(previous_mean_losses) > 1:
-            std = stdev(previous_mean_losses)
-        else:
-            std = 0
-        dataset_loss_info = f"dataset loss:{mean(previous_mean_losses):.3f}" + u"\u00B1" + f"({std / (len(previous_mean_losses) ** 0.5):.3f})"
-        pbar.set_description(dataset_loss_info)
+                textual_inversion.write_loss(log_directory, "hypernetwork_loss.csv", hypernetwork.step, steps_per_epoch, {
+                    "loss": f"{loss_step:.7f}",
+                    "learn_rate": scheduler.learn_rate
+                })

-        if hypernetwork_dir is not None and steps_done % save_hypernetwork_every == 0:
-            # Before saving, change name to match current checkpoint.
-            hypernetwork_name_every = f'{hypernetwork_name}-{steps_done}'
-            last_saved_file = os.path.join(hypernetwork_dir, f'{hypernetwork_name_every}.pt')
-            hypernetwork.optimizer_name = optimizer_name
-            if shared.opts.save_optimizer_state:
-                hypernetwork.optimizer_state_dict = optimizer.state_dict()
-            save_hypernetwork(hypernetwork, checkpoint, hypernetwork_name, last_saved_file)
-            hypernetwork.optimizer_state_dict = None  # dereference it after saving, to save memory.
+                if images_dir is not None and steps_done % create_image_every == 0:
+                    forced_filename = f'{hypernetwork_name}-{steps_done}'
+                    last_saved_image = os.path.join(images_dir, forced_filename)
+                    hypernetwork.eval_mode()
+                    shared.sd_model.cond_stage_model.to(devices.device)
+                    shared.sd_model.first_stage_model.to(devices.device)

-        textual_inversion.write_loss(log_directory, "hypernetwork_loss.csv", hypernetwork.step, len(ds), {
-            "loss": f"{previous_mean_loss:.7f}",
-            "learn_rate": scheduler.learn_rate
-        })
+                    p = processing.StableDiffusionProcessingTxt2Img(
+                        sd_model=shared.sd_model,
+                        do_not_save_grid=True,
+                        do_not_save_samples=True,
+                    )

-        if images_dir is not None and steps_done % create_image_every == 0:
-            forced_filename = f'{hypernetwork_name}-{steps_done}'
-            last_saved_image = os.path.join(images_dir, forced_filename)
+                    if preview_from_txt2img:
+                        p.prompt = preview_prompt
+                        p.negative_prompt = preview_negative_prompt
+                        p.steps = preview_steps
+                        p.sampler_name = sd_samplers.samplers[preview_sampler_index].name
+                        p.cfg_scale = preview_cfg_scale
+                        p.seed = preview_seed
+                        p.width = preview_width
+                        p.height = preview_height
+                    else:
+                        p.prompt = batch.cond_text[0]
+                        p.steps = 20
+                        p.width = training_width
+                        p.height = training_height

-            optimizer.zero_grad()
-            shared.sd_model.cond_stage_model.to(devices.device)
-            shared.sd_model.first_stage_model.to(devices.device)
+                    preview_text = p.prompt

-            p = processing.StableDiffusionProcessingTxt2Img(
-                sd_model=shared.sd_model,
-                do_not_save_grid=True,
-                do_not_save_samples=True,
-            )
+                    processed = processing.process_images(p)
+                    image = processed.images[0] if len(processed.images) > 0 else None

-            if preview_from_txt2img:
-                p.prompt = preview_prompt
-                p.negative_prompt = preview_negative_prompt
-                p.steps = preview_steps
-                p.sampler_index = preview_sampler_index
-                p.cfg_scale = preview_cfg_scale
-                p.seed = preview_seed
-                p.width = preview_width
-                p.height = preview_height
-            else:
-                p.prompt = entries[0].cond_text
-                p.steps = 20
+                    if unload:
+                        shared.sd_model.cond_stage_model.to(devices.cpu)
+                        shared.sd_model.first_stage_model.to(devices.cpu)
+                    hypernetwork.train_mode()
+                    if image is not None:
+                        shared.state.current_image = image
+                        last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
+                        last_saved_image += f", prompt: {preview_text}"

-            preview_text = p.prompt
+                shared.state.job_no = hypernetwork.step

-            processed = processing.process_images(p)
-            image = processed.images[0] if len(processed.images)>0 else None
-
-            if unload:
-                shared.sd_model.cond_stage_model.to(devices.cpu)
-                shared.sd_model.first_stage_model.to(devices.cpu)
-
-            if image is not None:
-                shared.state.current_image = image
-                last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
-                last_saved_image += f", prompt: {preview_text}"
-
-        shared.state.job_no = hypernetwork.step
-
-        shared.state.textinfo = f"""
+                shared.state.textinfo = f"""
 <p>
-Loss: {previous_mean_loss:.7f}<br/>
-Step: {hypernetwork.step}<br/>
-Last prompt: {html.escape(entries[0].cond_text)}<br/>
+Loss: {loss_step:.7f}<br/>
+Step: {steps_done}<br/>
+Last prompt: {html.escape(batch.cond_text[0])}<br/>
 Last saved hypernetwork: {html.escape(last_saved_file)}<br/>
 Last saved image: {html.escape(last_saved_image)}<br/>
 </p>
 """
-        
-    report_statistics(loss_dict)
+    except Exception:
+        print(traceback.format_exc(), file=sys.stderr)
+    finally:
+        pbar.leave = False
+        pbar.close()
+        hypernetwork.eval_mode()
+        #report_statistics(loss_dict)

    filename = os.path.join(shared.cmd_opts.hypernetwork_dir, f'{hypernetwork_name}.pt')
    hypernetwork.optimizer_name = optimizer_name
    if shared.opts.save_optimizer_state:
        hypernetwork.optimizer_state_dict = optimizer.state_dict()
    save_hypernetwork(hypernetwork, checkpoint, hypernetwork_name, filename)
+
    del optimizer
    hypernetwork.optimizer_state_dict = None  # dereference it after saving, to save memory.
+    shared.sd_model.cond_stage_model.to(devices.device)
+    shared.sd_model.first_stage_model.to(devices.device)
+    shared.parallel_processing_allowed = old_parallel_processing_allowed
+
    return hypernetwork, filename

 def save_hypernetwork(hypernetwork, checkpoint, hypernetwork_name, filename):
--- a/modules/hypernetworks/ui.py
+++ b/modules/hypernetworks/ui.py
@ -3,39 +3,16 @@ import os
 import re

 import gradio as gr
-import modules.textual_inversion.preprocess
-import modules.textual_inversion.textual_inversion
+import modules.hypernetworks.hypernetwork
 from modules import devices, sd_hijack, shared
-from modules.hypernetworks import hypernetwork

 not_available = ["hardswish", "multiheadattention"]
-keys = list(x for x in hypernetwork.HypernetworkModule.activation_dict.keys() if x not in not_available)
+keys = list(x for x in modules.hypernetworks.hypernetwork.HypernetworkModule.activation_dict.keys() if x not in not_available)

 def create_hypernetwork(name, enable_sizes, overwrite_old, layer_structure=None, activation_func=None, weight_init=None, add_layer_norm=False, use_dropout=False):
-    # Remove illegal characters from name.
-    name = "".join( x for x in name if (x.isalnum() or x in "._- "))
+    filename = modules.hypernetworks.hypernetwork.create_hypernetwork(name, enable_sizes, overwrite_old, layer_structure, activation_func, weight_init, add_layer_norm, use_dropout)

-    fn = os.path.join(shared.cmd_opts.hypernetwork_dir, f"{name}.pt")
-    if not overwrite_old:
-        assert not os.path.exists(fn), f"file {fn} already exists"
-
-    if type(layer_structure) == str:
-        layer_structure = [float(x.strip()) for x in layer_structure.split(",")]
-
-    hypernet = modules.hypernetworks.hypernetwork.Hypernetwork(
-        name=name,
-        enable_sizes=[int(x) for x in enable_sizes],
-        layer_structure=layer_structure,
-        activation_func=activation_func,
-        weight_init=weight_init,
-        add_layer_norm=add_layer_norm,
-        use_dropout=use_dropout,
-    )
-    hypernet.save(fn)
-
-    shared.reload_hypernetworks()
-
-    return gr.Dropdown.update(choices=sorted([x for x in shared.hypernetworks.keys()])), f"Created: {fn}", ""
+    return gr.Dropdown.update(choices=sorted([x for x in shared.hypernetworks.keys()])), f"Created: {filename}", ""


 def train_hypernetwork(*args):
--- a/modules/images.py
+++ b/modules/images.py
@ -15,6 +15,7 @@ import piexif.helper
 from PIL import Image, ImageFont, ImageDraw, PngImagePlugin
 from fonts.ttf import Roboto
 import string
+import json

 from modules import sd_samplers, shared, script_callbacks
 from modules.shared import opts, cmd_opts
@ -38,11 +39,14 @@ def image_grid(imgs, batch_size=1, rows=None):

    cols = math.ceil(len(imgs) / rows)

-    w, h = imgs[0].size
-    grid = Image.new('RGB', size=(cols * w, rows * h), color='black')
+    params = script_callbacks.ImageGridLoopParams(imgs, cols, rows)
+    script_callbacks.image_grid_callback(params)

-    for i, img in enumerate(imgs):
-        grid.paste(img, box=(i % cols * w, i // cols * h))
+    w, h = imgs[0].size
+    grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color='black')
+
+    for i, img in enumerate(params.imgs):
+        grid.paste(img, box=(i % params.cols * w, i // params.cols * h))

    return grid

@ -135,8 +139,19 @@ def draw_grid_annotations(im, width, height, hor_texts, ver_texts):
                lines.append(word)
        return lines

-    def draw_texts(drawing, draw_x, draw_y, lines):
+    def get_font(fontsize):
+        try:
+            return ImageFont.truetype(opts.font or Roboto, fontsize)
+        except Exception:
+            return ImageFont.truetype(Roboto, fontsize)
+
+    def draw_texts(drawing, draw_x, draw_y, lines, initial_fnt, initial_fontsize):
        for i, line in enumerate(lines):
+            fnt = initial_fnt
+            fontsize = initial_fontsize
+            while drawing.multiline_textsize(line.text, font=fnt)[0] > line.allowed_width and fontsize > 0:
+                fontsize -= 1
+                fnt = get_font(fontsize)
            drawing.multiline_text((draw_x, draw_y + line.size[1] / 2), line.text, font=fnt, fill=color_active if line.is_active else color_inactive, anchor="mm", align="center")

            if not line.is_active:
@ -147,10 +162,7 @@ def draw_grid_annotations(im, width, height, hor_texts, ver_texts):
    fontsize = (width + height) // 25
    line_spacing = fontsize // 2

-    try:
-        fnt = ImageFont.truetype(opts.font or Roboto, fontsize)
-    except Exception:
-        fnt = ImageFont.truetype(Roboto, fontsize)
+    fnt = get_font(fontsize)

    color_active = (0, 0, 0)
    color_inactive = (153, 153, 153)
@ -177,6 +189,7 @@ def draw_grid_annotations(im, width, height, hor_texts, ver_texts):
        for line in texts:
            bbox = calc_d.multiline_textbbox((0, 0), line.text, font=fnt)
            line.size = (bbox[2] - bbox[0], bbox[3] - bbox[1])
+            line.allowed_width = allowed_width

    hor_text_heights = [sum([line.size[1] + line_spacing for line in lines]) - line_spacing for lines in hor_texts]
    ver_text_heights = [sum([line.size[1] + line_spacing for line in lines]) - line_spacing * len(lines) for lines in
@ -193,13 +206,13 @@ def draw_grid_annotations(im, width, height, hor_texts, ver_texts):
        x = pad_left + width * col + width / 2
        y = pad_top / 2 - hor_text_heights[col] / 2

-        draw_texts(d, x, y, hor_texts[col])
+        draw_texts(d, x, y, hor_texts[col], fnt, fontsize)

    for row in range(rows):
        x = pad_left / 2
        y = pad_top + height * row + height / 2 - ver_text_heights[row] / 2

-        draw_texts(d, x, y, ver_texts[row])
+        draw_texts(d, x, y, ver_texts[row], fnt, fontsize)

    return result

@ -217,16 +230,32 @@ def draw_prompt_matrix(im, width, height, all_prompts):
    return draw_grid_annotations(im, width, height, hor_texts, ver_texts)


-def resize_image(resize_mode, im, width, height):
+def resize_image(resize_mode, im, width, height, upscaler_name=None):
+    """
+    Resizes an image with the specified resize_mode, width, and height.
+
+    Args:
+        resize_mode: The mode to use when resizing the image.
+            0: Resize the image to the specified width and height.
+            1: Resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess.
+            2: Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image.
+        im: The image to resize.
+        width: The width to resize the image to.
+        height: The height to resize the image to.
+        upscaler_name: The name of the upscaler to use. If not provided, defaults to opts.upscaler_for_img2img.
+    """
+
+    upscaler_name = upscaler_name or opts.upscaler_for_img2img
+
    def resize(im, w, h):
-        if opts.upscaler_for_img2img is None or opts.upscaler_for_img2img == "None" or im.mode == 'L':
+        if upscaler_name is None or upscaler_name == "None" or im.mode == 'L':
            return im.resize((w, h), resample=LANCZOS)

        scale = max(w / im.width, h / im.height)

        if scale > 1.0:
-            upscalers = [x for x in shared.sd_upscalers if x.name == opts.upscaler_for_img2img]
-            assert len(upscalers) > 0, f"could not find upscaler named {opts.upscaler_for_img2img}"
+            upscalers = [x for x in shared.sd_upscalers if x.name == upscaler_name]
+            assert len(upscalers) > 0, f"could not find upscaler named {upscaler_name}"

            upscaler = upscalers[0]
            im = upscaler.scaler.upscale(im, scale, upscaler.data_path)
@ -303,8 +332,9 @@ class FilenameGenerator:
        'width': lambda self: self.image.width,
        'height': lambda self: self.image.height,
        'styles': lambda self: self.p and sanitize_filename_part(", ".join([style for style in self.p.styles if not style == "None"]) or "None", replace_spaces=False),
-        'sampler': lambda self: self.p and sanitize_filename_part(sd_samplers.samplers[self.p.sampler_index].name, replace_spaces=False),
+        'sampler': lambda self: self.p and sanitize_filename_part(self.p.sampler_name, replace_spaces=False),
        'model_hash': lambda self: getattr(self.p, "sd_model_hash", shared.sd_model.sd_model_hash),
+        'model_name': lambda self: sanitize_filename_part(shared.sd_model.sd_checkpoint_info.model_name, replace_spaces=False),
        'date': lambda self: datetime.datetime.now().strftime('%Y-%m-%d'),
        'datetime': lambda self, *args: self.datetime(*args),  # accepts formats: [datetime], [datetime<Format>], [datetime<Format><Time Zone>]
        'job_timestamp': lambda self: getattr(self.p, "job_timestamp", shared.state.job_timestamp),
@ -427,7 +457,7 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
            The directory to save the image. Note, the option `save_to_dirs` will make the image to be saved into a sub directory.
        basename (`str`):
            The base filename which will be applied to `filename pattern`.
-        seed, prompt, short_filename, 
+        seed, prompt, short_filename,
        extension (`str`):
            Image file extension, default is `png`.
        pngsectionname (`str`):
@ -499,30 +529,44 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
    image = params.image
    fullfn = params.filename
    info = params.pnginfo.get(pnginfo_section_name, None)
+
+    def _atomically_save_image(image_to_save, filename_without_extension, extension):
+        """Write image_to_save to filename_without_extension + extension through a temporary file.
+
+        The image is first saved as filename_without_extension + ".tmp" and then moved into place
+        with os.replace, to avoid a race condition when another process detects the new image in
+        the directory.
+
+        Uses params, info and opts from the enclosing scope: when opts.enable_pnginfo is set,
+        PNG files receive every params.pnginfo entry as a text chunk, and JPEG/WebP files receive
+        info as an EXIF UserComment.
+        """
+        temp_file_path = filename_without_extension + ".tmp"
+
+        # Pillow registers extensions in lower case; normalise once so that an extension such as
+        # ".PNG" resolves to its format instead of raising KeyError, and so that the lookup and
+        # the branches below always agree.
+        extension_lower = extension.lower()
+        image_format = Image.registered_extensions()[extension_lower]
+
+        if extension_lower == '.png':
+            pnginfo_data = PngImagePlugin.PngInfo()
+            if opts.enable_pnginfo:
+                for k, v in params.pnginfo.items():
+                    pnginfo_data.add_text(k, str(v))
+
+            image_to_save.save(temp_file_path, format=image_format, quality=opts.jpeg_quality, pnginfo=pnginfo_data)
+
+        elif extension_lower in (".jpg", ".jpeg", ".webp"):
+            # these branches are written without an alpha channel
+            if image_to_save.mode == 'RGBA':
+                image_to_save = image_to_save.convert("RGB")
+
+            image_to_save.save(temp_file_path, format=image_format, quality=opts.jpeg_quality)
+
+            if opts.enable_pnginfo and info is not None:
+                exif_bytes = piexif.dump({
+                    "Exif": {
+                        piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(info or "", encoding="unicode")
+                    },
+                })
+
+                piexif.insert(exif_bytes, temp_file_path)
+        else:
+            image_to_save.save(temp_file_path, format=image_format, quality=opts.jpeg_quality)
+
+        # atomically rename the file with correct extension (the caller's spelling is kept)
+        os.replace(temp_file_path, filename_without_extension + extension)
+
    fullfn_without_extension, extension = os.path.splitext(params.filename)
+    _atomically_save_image(image, fullfn_without_extension, extension)

-    def exif_bytes():
-        return piexif.dump({
-            "Exif": {
-                piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(info or "", encoding="unicode")
-            },
-        })
-
-    if extension.lower() == '.png':
-        pnginfo_data = PngImagePlugin.PngInfo()
-        if opts.enable_pnginfo:
-            for k, v in params.pnginfo.items():
-                pnginfo_data.add_text(k, str(v))
-
-        image.save(fullfn, quality=opts.jpeg_quality, pnginfo=pnginfo_data)
-
-    elif extension.lower() in (".jpg", ".jpeg", ".webp"):
-        image.save(fullfn, quality=opts.jpeg_quality)
-
-        if opts.enable_pnginfo and info is not None:
-            piexif.insert(exif_bytes(), fullfn)
-    else:
-        image.save(fullfn, quality=opts.jpeg_quality)
+    image.already_saved_as = fullfn

    target_side_length = 4000
    oversize = image.width > target_side_length or image.height > target_side_length
@ -534,9 +578,7 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
        elif oversize:
            image = image.resize((image.width * target_side_length // image.height, target_side_length), LANCZOS)

-        image.save(fullfn_without_extension + ".jpg", quality=opts.jpeg_quality)
-        if opts.enable_pnginfo and info is not None:
-            piexif.insert(exif_bytes(), fullfn_without_extension + ".jpg")
+        _atomically_save_image(image, fullfn_without_extension, ".jpg")

    if opts.save_txt and info is not None:
        txt_fullfn = f"{fullfn_without_extension}.txt"
@ -550,10 +592,45 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
    return fullfn, txt_fullfn


+def read_info_from_image(image):
+    """Extract generation parameters and the remaining metadata from an opened image.
+
+    Returns a tuple (geninfo, items):
+        geninfo: the 'parameters' entry of image.info, or None when there is none. It is replaced
+            by the EXIF UserComment when the image has an "exif" entry, and by a reconstructed
+            parameter text when the "Software" entry is "NovelAI" and its "Comment" parses.
+        items: the metadata entries that are left after the above has been taken out.
+
+    image.info itself is not modified; all removals happen on a copy.
+    """
+    # work on a copy: popping from image.info directly would strip metadata from the caller's image
+    items = (image.info or {}).copy()
+
+    geninfo = items.pop('parameters', None)
+
+    if "exif" in items:
+        exif = piexif.load(items["exif"])
+        exif_comment = (exif or {}).get("Exif", {}).get(piexif.ExifIFD.UserComment, b'')
+        try:
+            exif_comment = piexif.helper.UserComment.load(exif_comment)
+        except ValueError:
+            # not a well-formed UserComment: fall back to decoding the raw bytes
+            exif_comment = exif_comment.decode('utf8', errors="ignore")
+
+        items['exif comment'] = exif_comment
+        geninfo = exif_comment
+
+        # drop the entries that should not be reported back as metadata
+        for field in ['jfif', 'jfif_version', 'jfif_unit', 'jfif_density', 'dpi', 'exif',
+                      'loop', 'background', 'timestamp', 'duration']:
+            items.pop(field, None)
+
+    if items.get("Software", None) == "NovelAI":
+        try:
+            json_info = json.loads(items["Comment"])
+            sampler = sd_samplers.samplers_map.get(json_info["sampler"], "Euler a")
+
+            geninfo = f"""{items["Description"]}
+Negative prompt: {json_info["uc"]}
+Steps: {json_info["steps"]}, Sampler: {sampler}, CFG scale: {json_info["scale"]}, Seed: {json_info["seed"]}, Size: {image.width}x{image.height}, Clip skip: 2, ENSD: 31337"""
+        except Exception:
+            # best effort: keep whatever geninfo was found before and report the problem
+            print("Error parsing NovelAI image generation parameters:", file=sys.stderr)
+            print(traceback.format_exc(), file=sys.stderr)
+
+    return geninfo, items
+
+
 def image_data(data):
    try:
        image = Image.open(io.BytesIO(data))
-        textinfo = image.text["parameters"]
+        textinfo, _ = read_info_from_image(image)
        return textinfo, None
    except Exception:
        pass
@ -567,3 +644,14 @@ def image_data(data):
        pass

    return '', None
+
+
+def flatten(img, bgcolor):
+    """returns an RGB mode image in which any transparency of img is replaced with bgcolor (example: "#ffffff")"""
+
+    if img.mode != "RGBA":
+        return img.convert('RGB')
+
+    # lay the image over a solid canvas, passing the image itself as the paste mask
+    canvas = Image.new('RGBA', img.size, bgcolor)
+    canvas.paste(img, mask=img)
+
+    return canvas.convert('RGB')
--- a/modules/img2img.py
+++ b/modules/img2img.py
@ -4,9 +4,9 @@ import sys
 import traceback

 import numpy as np
-from PIL import Image, ImageOps, ImageChops
+from PIL import Image, ImageOps, ImageFilter, ImageEnhance, ImageChops

-from modules import devices
+from modules import devices, sd_samplers
 from modules.processing import Processed, StableDiffusionProcessingImg2Img, process_images
 from modules.shared import opts, state
 import modules.shared as shared
@ -40,7 +40,7 @@ def process_batch(p, input_dir, output_dir, args):

        img = Image.open(image)
        # Use the EXIF orientation of photos taken by smartphones.
-        img = ImageOps.exif_transpose(img) 
+        img = ImageOps.exif_transpose(img)
        p.init_images = [img] * p.batch_size

        proc = modules.scripts.scripts_img2img.run(p, *args)
@ -59,18 +59,31 @@ def process_batch(p, input_dir, output_dir, args):
                processed_image.save(os.path.join(output_dir, filename))


-def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, init_img, init_img_with_mask, init_img_inpaint, init_mask_inpaint, mask_mode, steps: int, sampler_index: int, mask_blur: int, inpainting_fill: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, denoising_strength: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, *args):
+def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, init_img, init_img_with_mask, init_img_with_mask_orig, init_img_inpaint, init_mask_inpaint, mask_mode, steps: int, sampler_index: int, mask_blur: int, mask_alpha: float, inpainting_fill: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, denoising_strength: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, *args):
    is_inpaint = mode == 1
    is_batch = mode == 2

    if is_inpaint:
        # Drawn mask
        if mask_mode == 0:
-            image = init_img_with_mask['image']
-            mask = init_img_with_mask['mask']
-            alpha_mask = ImageOps.invert(image.split()[-1]).convert('L').point(lambda x: 255 if x > 0 else 0, mode='1')
-            mask = ImageChops.lighter(alpha_mask, mask.convert('L')).convert('L')
-            image = image.convert('RGB')
+            is_mask_sketch = isinstance(init_img_with_mask, dict)
+            is_mask_paint = not is_mask_sketch
+            if is_mask_sketch:
+                # Sketch: mask iff. not transparent
+                image, mask = init_img_with_mask["image"], init_img_with_mask["mask"]
+                alpha_mask = ImageOps.invert(image.split()[-1]).convert('L').point(lambda x: 255 if x > 0 else 0, mode='1')
+                mask = ImageChops.lighter(alpha_mask, mask.convert('L')).convert('L')
+            else:
+                # Color-sketch: mask iff. painted over
+                image = init_img_with_mask
+                orig = init_img_with_mask_orig or init_img_with_mask
+                pred = np.any(np.array(image) != np.array(orig), axis=-1)
+                mask = Image.fromarray(pred.astype(np.uint8) * 255, "L")
+                mask = ImageEnhance.Brightness(mask).enhance(1 - mask_alpha / 100)
+                blur = ImageFilter.GaussianBlur(mask_blur)
+                image = Image.composite(image.filter(blur), orig, mask.filter(blur))
+
+            image = image.convert("RGB")
        # Uploaded mask
        else:
            image = init_img_inpaint
@ -82,7 +95,7 @@ def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, pro

    # Use the EXIF orientation of photos taken by smartphones.
    if image is not None:
-        image = ImageOps.exif_transpose(image) 
+        image = ImageOps.exif_transpose(image)

    assert 0. <= denoising_strength <= 1., 'can only work with strength in [0.0, 1.0]'

@ -99,7 +112,7 @@ def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, pro
        seed_resize_from_h=seed_resize_from_h,
        seed_resize_from_w=seed_resize_from_w,
        seed_enable_extras=seed_enable_extras,
-        sampler_index=sampler_index,
+        sampler_name=sd_samplers.samplers_for_img2img[sampler_index].name,
        batch_size=batch_size,
        n_iter=n_iter,
        steps=steps,
@ -149,4 +162,4 @@ def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, pro
    if opts.do_not_show_images:
        processed.images = []

-    return processed.images, generation_info_js, plaintext_to_html(processed.info)
+    return processed.images, generation_info_js, plaintext_to_html(processed.info), plaintext_to_html(processed.comments)
--- a/modules/import_hook.py
+++ b/modules/import_hook.py
@ -0,0 +1,5 @@
+import sys
+
+# this will break any attempt to import xformers which will prevent stability diffusion repo from trying to use it
+# (a None entry in sys.modules makes a later `import xformers` fail instead of loading the package).
+# The test is a plain substring match on all command line arguments joined together, so the import
+# is left working whenever "--xformers" appears anywhere in them.
+if "--xformers" not in "".join(sys.argv):
+    sys.modules["xformers"] = None
--- a/modules/interrogate.py
+++ b/modules/interrogate.py
@ -1,4 +1,3 @@
-import contextlib
 import os
 import sys
 import traceback
@ -11,10 +10,9 @@ from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode

 import modules.shared as shared
-from modules import devices, paths, lowvram
+from modules import devices, paths, lowvram, modelloader

 blip_image_eval_size = 384
-blip_model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
 clip_model_name = 'ViT-L/14'

 Category = namedtuple("Category", ["name", "topn", "items"])
@ -47,7 +45,14 @@ class InterrogateModels:
    def load_blip_model(self):
        import models.blip

-        blip_model = models.blip.blip_decoder(pretrained=blip_model_url, image_size=blip_image_eval_size, vit='base', med_config=os.path.join(paths.paths["BLIP"], "configs", "med_config.json"))
+        files = modelloader.load_models(
+            model_path=os.path.join(paths.models_path, "BLIP"),
+            model_url='https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth',
+            ext_filter=[".pth"],
+            download_name='model_base_caption_capfilt_large.pth',
+        )
+
+        blip_model = models.blip.blip_decoder(pretrained=files[0], image_size=blip_image_eval_size, vit='base', med_config=os.path.join(paths.paths["BLIP"], "configs", "med_config.json"))
        blip_model.eval()

        return blip_model
@ -130,8 +135,9 @@ class InterrogateModels:
        return caption[0]

    def interrogate(self, pil_image):
-        res = None
-
+        res = ""
+        shared.state.begin()
+        shared.state.job = 'interrogate'
        try:

            if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
@ -148,8 +154,7 @@ class InterrogateModels:

            clip_image = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)

-            precision_scope = torch.autocast if shared.cmd_opts.precision == "autocast" else contextlib.nullcontext
-            with torch.no_grad(), precision_scope("cuda"):
+            with torch.no_grad(), devices.autocast():
                image_features = self.clip_model.encode_image(clip_image).type(self.dtype)

                image_features /= image_features.norm(dim=-1, keepdim=True)
@ -168,10 +173,11 @@ class InterrogateModels:
                            res += ", " + match

        except Exception:
-            print(f"Error interrogating", file=sys.stderr)
+            print("Error interrogating", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            res += "<error>"

        self.unload()
+        shared.state.end()

        return res
--- a/modules/ldsr_model.py
+++ b/modules/ldsr_model.py
@ -1,54 +0,0 @@
-import os
-import sys
-import traceback
-
-from basicsr.utils.download_util import load_file_from_url
-
-from modules.upscaler import Upscaler, UpscalerData
-from modules.ldsr_model_arch import LDSR
-from modules import shared
-
-
-class UpscalerLDSR(Upscaler):
-    def __init__(self, user_path):
-        self.name = "LDSR"
-        self.user_path = user_path
-        self.model_url = "https://heibox.uni-heidelberg.de/f/578df07c8fc04ffbadf3/?dl=1"
-        self.yaml_url = "https://heibox.uni-heidelberg.de/f/31a76b13ea27482981b4/?dl=1"
-        super().__init__()
-        scaler_data = UpscalerData("LDSR", None, self)
-        self.scalers = [scaler_data]
-
-    def load_model(self, path: str):
-        # Remove incorrect project.yaml file if too big
-        yaml_path = os.path.join(self.model_path, "project.yaml")
-        old_model_path = os.path.join(self.model_path, "model.pth")
-        new_model_path = os.path.join(self.model_path, "model.ckpt")
-        if os.path.exists(yaml_path):
-            statinfo = os.stat(yaml_path)
-            if statinfo.st_size >= 10485760:
-                print("Removing invalid LDSR YAML file.")
-                os.remove(yaml_path)
-        if os.path.exists(old_model_path):
-            print("Renaming model from model.pth to model.ckpt")
-            os.rename(old_model_path, new_model_path)
-        model = load_file_from_url(url=self.model_url, model_dir=self.model_path,
-                                   file_name="model.ckpt", progress=True)
-        yaml = load_file_from_url(url=self.yaml_url, model_dir=self.model_path,
-                                  file_name="project.yaml", progress=True)
-
-        try:
-            return LDSR(model, yaml)
-
-        except Exception:
-            print("Error importing LDSR:", file=sys.stderr)
-            print(traceback.format_exc(), file=sys.stderr)
-        return None
-
-    def do_upscale(self, img, path):
-        ldsr = self.load_model(path)
-        if ldsr is None:
-            print("NO LDSR!")
-            return img
-        ddim_steps = shared.opts.ldsr_steps
-        return ldsr.super_resolution(img, ddim_steps, self.scale)
--- a/modules/ldsr_model_arch.py
+++ b/modules/ldsr_model_arch.py
@ -1,230 +0,0 @@
-import gc
-import time
-import warnings
-
-import numpy as np
-import torch
-import torchvision
-from PIL import Image
-from einops import rearrange, repeat
-from omegaconf import OmegaConf
-
-from ldm.models.diffusion.ddim import DDIMSampler
-from ldm.util import instantiate_from_config, ismap
-
-warnings.filterwarnings("ignore", category=UserWarning)
-
-
-# Create LDSR Class
-class LDSR:
-    def load_model_from_config(self, half_attention):
-        print(f"Loading model from {self.modelPath}")
-        pl_sd = torch.load(self.modelPath, map_location="cpu")
-        sd = pl_sd["state_dict"]
-        config = OmegaConf.load(self.yamlPath)
-        model = instantiate_from_config(config.model)
-        model.load_state_dict(sd, strict=False)
-        model.cuda()
-        if half_attention:
-            model = model.half()
-
-        model.eval()
-        return {"model": model}
-
-    def __init__(self, model_path, yaml_path):
-        self.modelPath = model_path
-        self.yamlPath = yaml_path
-
-    @staticmethod
-    def run(model, selected_path, custom_steps, eta):
-        example = get_cond(selected_path)
-
-        n_runs = 1
-        guider = None
-        ckwargs = None
-        ddim_use_x0_pred = False
-        temperature = 1.
-        eta = eta
-        custom_shape = None
-
-        height, width = example["image"].shape[1:3]
-        split_input = height >= 128 and width >= 128
-
-        if split_input:
-            ks = 128
-            stride = 64
-            vqf = 4  #
-            model.split_input_params = {"ks": (ks, ks), "stride": (stride, stride),
-                                        "vqf": vqf,
-                                        "patch_distributed_vq": True,
-                                        "tie_braker": False,
-                                        "clip_max_weight": 0.5,
-                                        "clip_min_weight": 0.01,
-                                        "clip_max_tie_weight": 0.5,
-                                        "clip_min_tie_weight": 0.01}
-        else:
-            if hasattr(model, "split_input_params"):
-                delattr(model, "split_input_params")
-
-        x_t = None
-        logs = None
-        for n in range(n_runs):
-            if custom_shape is not None:
-                x_t = torch.randn(1, custom_shape[1], custom_shape[2], custom_shape[3]).to(model.device)
-                x_t = repeat(x_t, '1 c h w -> b c h w', b=custom_shape[0])
-
-            logs = make_convolutional_sample(example, model,
-                                             custom_steps=custom_steps,
-                                             eta=eta, quantize_x0=False,
-                                             custom_shape=custom_shape,
-                                             temperature=temperature, noise_dropout=0.,
-                                             corrector=guider, corrector_kwargs=ckwargs, x_T=x_t,
-                                             ddim_use_x0_pred=ddim_use_x0_pred
-                                             )
-        return logs
-
-    def super_resolution(self, image, steps=100, target_scale=2, half_attention=False):
-        model = self.load_model_from_config(half_attention)
-
-        # Run settings
-        diffusion_steps = int(steps)
-        eta = 1.0
-
-        down_sample_method = 'Lanczos'
-
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        im_og = image
-        width_og, height_og = im_og.size
-        # If we can adjust the max upscale size, then the 4 below should be our variable
-        down_sample_rate = target_scale / 4
-        wd = width_og * down_sample_rate
-        hd = height_og * down_sample_rate
-        width_downsampled_pre = int(np.ceil(wd))
-        height_downsampled_pre = int(np.ceil(hd))
-
-        if down_sample_rate != 1:
-            print(
-                f'Downsampling from [{width_og}, {height_og}] to [{width_downsampled_pre}, {height_downsampled_pre}]')
-            im_og = im_og.resize((width_downsampled_pre, height_downsampled_pre), Image.LANCZOS)
-        else:
-            print(f"Down sample rate is 1 from {target_scale} / 4 (Not downsampling)")
-        
-        # pad width and height to multiples of 64, pads with the edge values of image to avoid artifacts
-        pad_w, pad_h = np.max(((2, 2), np.ceil(np.array(im_og.size) / 64).astype(int)), axis=0) * 64 - im_og.size
-        im_padded = Image.fromarray(np.pad(np.array(im_og), ((0, pad_h), (0, pad_w), (0, 0)), mode='edge'))
-        
-        logs = self.run(model["model"], im_padded, diffusion_steps, eta)
-
-        sample = logs["sample"]
-        sample = sample.detach().cpu()
-        sample = torch.clamp(sample, -1., 1.)
-        sample = (sample + 1.) / 2. * 255
-        sample = sample.numpy().astype(np.uint8)
-        sample = np.transpose(sample, (0, 2, 3, 1))
-        a = Image.fromarray(sample[0])
-
-        # remove padding
-        a = a.crop((0, 0) + tuple(np.array(im_og.size) * 4))
-
-        del model
-        gc.collect()
-        torch.cuda.empty_cache()
-        return a
-
-
-def get_cond(selected_path):
-    example = dict()
-    up_f = 4
-    c = selected_path.convert('RGB')
-    c = torch.unsqueeze(torchvision.transforms.ToTensor()(c), 0)
-    c_up = torchvision.transforms.functional.resize(c, size=[up_f * c.shape[2], up_f * c.shape[3]],
-                                                    antialias=True)
-    c_up = rearrange(c_up, '1 c h w -> 1 h w c')
-    c = rearrange(c, '1 c h w -> 1 h w c')
-    c = 2. * c - 1.
-
-    c = c.to(torch.device("cuda"))
-    example["LR_image"] = c
-    example["image"] = c_up
-
-    return example
-
-
-@torch.no_grad()
-def convsample_ddim(model, cond, steps, shape, eta=1.0, callback=None, normals_sequence=None,
-                    mask=None, x0=None, quantize_x0=False, temperature=1., score_corrector=None,
-                    corrector_kwargs=None, x_t=None
-                    ):
-    ddim = DDIMSampler(model)
-    bs = shape[0]
-    shape = shape[1:]
-    print(f"Sampling with eta = {eta}; steps: {steps}")
-    samples, intermediates = ddim.sample(steps, batch_size=bs, shape=shape, conditioning=cond, callback=callback,
-                                         normals_sequence=normals_sequence, quantize_x0=quantize_x0, eta=eta,
-                                         mask=mask, x0=x0, temperature=temperature, verbose=False,
-                                         score_corrector=score_corrector,
-                                         corrector_kwargs=corrector_kwargs, x_t=x_t)
-
-    return samples, intermediates
-
-
-@torch.no_grad()
-def make_convolutional_sample(batch, model, custom_steps=None, eta=1.0, quantize_x0=False, custom_shape=None, temperature=1., noise_dropout=0., corrector=None,
-                              corrector_kwargs=None, x_T=None, ddim_use_x0_pred=False):
-    log = dict()
-
-    z, c, x, xrec, xc = model.get_input(batch, model.first_stage_key,
-                                        return_first_stage_outputs=True,
-                                        force_c_encode=not (hasattr(model, 'split_input_params')
-                                                            and model.cond_stage_key == 'coordinates_bbox'),
-                                        return_original_cond=True)
-
-    if custom_shape is not None:
-        z = torch.randn(custom_shape)
-        print(f"Generating {custom_shape[0]} samples of shape {custom_shape[1:]}")
-
-    z0 = None
-
-    log["input"] = x
-    log["reconstruction"] = xrec
-
-    if ismap(xc):
-        log["original_conditioning"] = model.to_rgb(xc)
-        if hasattr(model, 'cond_stage_key'):
-            log[model.cond_stage_key] = model.to_rgb(xc)
-
-    else:
-        log["original_conditioning"] = xc if xc is not None else torch.zeros_like(x)
-        if model.cond_stage_model:
-            log[model.cond_stage_key] = xc if xc is not None else torch.zeros_like(x)
-            if model.cond_stage_key == 'class_label':
-                log[model.cond_stage_key] = xc[model.cond_stage_key]
-
-    with model.ema_scope("Plotting"):
-        t0 = time.time()
-
-        sample, intermediates = convsample_ddim(model, c, steps=custom_steps, shape=z.shape,
-                                                eta=eta,
-                                                quantize_x0=quantize_x0, mask=None, x0=z0,
-                                                temperature=temperature, score_corrector=corrector, corrector_kwargs=corrector_kwargs,
-                                                x_t=x_T)
-        t1 = time.time()
-
-        if ddim_use_x0_pred:
-            sample = intermediates['pred_x0'][-1]
-
-    x_sample = model.decode_first_stage(sample)
-
-    try:
-        x_sample_noquant = model.decode_first_stage(sample, force_not_quantize=True)
-        log["sample_noquant"] = x_sample_noquant
-        log["sample_diff"] = torch.abs(x_sample_noquant - x_sample)
-    except:
-        pass
-
-    log["sample"] = x_sample
-    log["time"] = t1 - t0
-
-    return log
--- a/modules/lowvram.py
+++ b/modules/lowvram.py
@ -51,20 +51,30 @@ def setup_for_low_vram(sd_model, use_medvram):
        send_me_to_gpu(first_stage_model, None)
        return first_stage_model_decode(z)

-    # remove three big modules, cond, first_stage, and unet from the model and then
-    # send the model to GPU. Then put modules back. the modules will be in CPU.
-    stored = sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.model
-    sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.model = None, None, None
-    sd_model.to(devices.device)
-    sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.model = stored
+    # for SD1, cond_stage_model is CLIP and its NN is in the transformer field, but for SD2, it's open clip, and it's in the model field
+    if hasattr(sd_model.cond_stage_model, 'model'):
+        sd_model.cond_stage_model.transformer = sd_model.cond_stage_model.model

-    # register hooks for those the first two models
+    # remove four big modules, cond, first_stage, depth (if applicable), and unet from the model and then
+    # send the model to GPU. Then put the modules back. The modules will be on the CPU.
+    stored = sd_model.cond_stage_model.transformer, sd_model.first_stage_model, getattr(sd_model, 'depth_model', None), sd_model.model
+    sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.depth_model, sd_model.model = None, None, None, None
+    sd_model.to(devices.device)
+    sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.depth_model, sd_model.model = stored
+
+    # register hooks for the first three models
    sd_model.cond_stage_model.transformer.register_forward_pre_hook(send_me_to_gpu)
    sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu)
    sd_model.first_stage_model.encode = first_stage_model_encode_wrap
    sd_model.first_stage_model.decode = first_stage_model_decode_wrap
+    if sd_model.depth_model:
+        sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu)
    parents[sd_model.cond_stage_model.transformer] = sd_model.cond_stage_model

+    if hasattr(sd_model.cond_stage_model, 'model'):
+        sd_model.cond_stage_model.model = sd_model.cond_stage_model.transformer
+        del sd_model.cond_stage_model.transformer
+
    if use_medvram:
        sd_model.model.register_forward_pre_hook(send_me_to_gpu)
    else:
--- a/modules/memmon.py
+++ b/modules/memmon.py
@ -71,10 +71,13 @@ class MemUsageMonitor(threading.Thread):
    def read(self):
        if not self.disabled:
            free, total = torch.cuda.mem_get_info()
+            self.data["free"] = free
            self.data["total"] = total

            torch_stats = torch.cuda.memory_stats(self.device)
+            self.data["active"] = torch_stats["active.all.current"]
            self.data["active_peak"] = torch_stats["active_bytes.all.peak"]
+            self.data["reserved"] = torch_stats["reserved_bytes.all.current"]
            self.data["reserved_peak"] = torch_stats["reserved_bytes.all.peak"]
            self.data["system_peak"] = total - self.data["min_free"]

--- a/modules/modelloader.py
+++ b/modules/modelloader.py
@ -82,6 +82,7 @@ def cleanup_models():
    src_path = models_path
    dest_path = os.path.join(models_path, "Stable-diffusion")
    move_files(src_path, dest_path, ".ckpt")
+    move_files(src_path, dest_path, ".safetensors")
    src_path = os.path.join(root_path, "ESRGAN")
    dest_path = os.path.join(models_path, "ESRGAN")
    move_files(src_path, dest_path)
@ -122,11 +123,27 @@ def move_files(src_path: str, dest_path: str, ext_filter: str = None):
        pass


+builtin_upscaler_classes = []
+forbidden_upscaler_classes = set()
+
+
+def list_builtin_upscalers():
+    load_upscalers()
+
+    builtin_upscaler_classes.clear()
+    builtin_upscaler_classes.extend(Upscaler.__subclasses__())
+
+
+def forbid_loaded_nonbuiltin_upscalers():
+    for cls in Upscaler.__subclasses__():
+        if cls not in builtin_upscaler_classes:
+            forbidden_upscaler_classes.add(cls)
+
+
 def load_upscalers():
-    sd = shared.script_path
    # We can only do this 'magic' method to dynamically load upscalers if they are referenced,
    # so we'll try to import any _model.py files before looking in __subclasses__
-    modules_dir = os.path.join(sd, "modules")
+    modules_dir = os.path.join(shared.script_path, "modules")
    for file in os.listdir(modules_dir):
        if "_model.py" in file:
            model_name = file.replace("_model.py", "")
@ -135,22 +152,16 @@ def load_upscalers():
                importlib.import_module(full_model)
            except:
                pass
+
    datas = []
-    c_o = vars(shared.cmd_opts)
+    commandline_options = vars(shared.cmd_opts)
    for cls in Upscaler.__subclasses__():
+        if cls in forbidden_upscaler_classes:
+            continue
+
        name = cls.__name__
-        module_name = cls.__module__
-        module = importlib.import_module(module_name)
-        class_ = getattr(module, name)
        cmd_name = f"{name.lower().replace('upscaler', '')}_models_path"
-        opt_string = None
-        try:
-            if cmd_name in c_o:
-                opt_string = c_o[cmd_name]
-        except:
-            pass
-        scaler = class_(opt_string)
-        for child in scaler.scalers:
-            datas.append(child)
+        scaler = cls(commandline_options.get(cmd_name, None))
+        datas += scaler.scalers

    shared.sd_upscalers = datas
--- a/modules/ngrok.py
+++ b/modules/ngrok.py
@ -1,14 +1,23 @@
 from pyngrok import ngrok, conf, exception

-
 def connect(token, port, region):
-    if token == None:
+    account = None
+    if token is None:
        token = 'None'
+    else:
+        if ':' in token:
+            # token = authtoken:username:password
+            account = token.split(':')[1] + ':' + token.split(':')[-1]
+            token = token.split(':')[0]
+
    config = conf.PyngrokConfig(
        auth_token=token, region=region
    )
    try:
-        public_url = ngrok.connect(port, pyngrok_config=config).public_url
+        if account is None:
+            public_url = ngrok.connect(port, pyngrok_config=config, bind_tls=True).public_url
+        else:
+            public_url = ngrok.connect(port, pyngrok_config=config, bind_tls=True, auth=account).public_url
    except exception.PyngrokNgrokError:
        print(f'Invalid ngrok authtoken, ngrok connection aborted.\n'
              f'Your token: {token}, get the right one on https://dashboard.ngrok.com/get-started/your-authtoken')
--- a/modules/paths.py
+++ b/modules/paths.py
@ -9,7 +9,7 @@ sys.path.insert(0, script_path)

 # search for directory of stable diffusion in following places
 sd_path = None
-possible_sd_paths = [os.path.join(script_path, 'repositories/stable-diffusion'), '.', os.path.dirname(script_path)]
+possible_sd_paths = [os.path.join(script_path, 'repositories/stable-diffusion-stability-ai'), '.', os.path.dirname(script_path)]
 for possible_sd_path in possible_sd_paths:
    if os.path.exists(os.path.join(possible_sd_path, 'ldm/models/diffusion/ddpm.py')):
        sd_path = os.path.abspath(possible_sd_path)
--- a/modules/processing.py
+++ b/modules/processing.py
@ -2,6 +2,7 @@ import json
 import math
 import os
 import sys
+import warnings

 import torch
 import numpy as np
@ -12,15 +13,21 @@ from skimage import exposure
 from typing import Any, Dict, List, Optional

 import modules.sd_hijack
-from modules import devices, prompt_parser, masking, sd_samplers, lowvram, generation_parameters_copypaste
+from modules import devices, prompt_parser, masking, sd_samplers, lowvram, generation_parameters_copypaste, script_callbacks
 from modules.sd_hijack import model_hijack
 from modules.shared import opts, cmd_opts, state
 import modules.shared as shared
 import modules.face_restoration
 import modules.images as images
 import modules.styles
+import modules.sd_models as sd_models
+import modules.sd_vae as sd_vae
 import logging
+from ldm.data.util import AddMiDaS
+from ldm.models.diffusion.ddpm import LatentDepth2ImageDiffusion

+from einops import repeat, rearrange
+from blendmodes.blend import blendLayers, BlendType

 # some of those options should not be changed at all because they would break the model, so I removed them from options.
 opt_C = 4
@ -33,17 +40,19 @@ def setup_color_correction(image):
    return correction_target


-def apply_color_correction(correction, image):
+def apply_color_correction(correction, original_image):
    logging.info("Applying color correction.")
    image = Image.fromarray(cv2.cvtColor(exposure.match_histograms(
        cv2.cvtColor(
-            np.asarray(image),
+            np.asarray(original_image),
            cv2.COLOR_RGB2LAB
        ),
        correction,
        channel_axis=2
    ), cv2.COLOR_LAB2RGB).astype("uint8"))
-
+    
+    image = blendLayers(image, original_image, BlendType.LUMINOSITY)
+    
    return image


@ -66,19 +75,33 @@ def apply_overlay(image, paste_loc, index, overlays):

    return image

-def get_correct_sampler(p):
-    if isinstance(p, modules.processing.StableDiffusionProcessingTxt2Img):
-        return sd_samplers.samplers
-    elif isinstance(p, modules.processing.StableDiffusionProcessingImg2Img):
-        return sd_samplers.samplers_for_img2img
-    elif isinstance(p, modules.api.processing.StableDiffusionProcessingAPI):
-        return sd_samplers.samplers
+
+def txt2img_image_conditioning(sd_model, x, width, height):
+    if sd_model.model.conditioning_key not in {'hybrid', 'concat'}:
+        # Dummy zero conditioning if we're not using inpainting model.
+        # Still takes up a bit of memory, but no encoder call.
+        # Pretty sure we can just make this a 1x1 image since it's not going to be used besides its batch size.
+        return x.new_zeros(x.shape[0], 5, 1, 1, dtype=x.dtype, device=x.device)
+
+    # The "masked-image" in this case will just be all zeros since the entire image is masked.
+    image_conditioning = torch.zeros(x.shape[0], 3, height, width, device=x.device)
+    image_conditioning = sd_model.get_first_stage_encoding(sd_model.encode_first_stage(image_conditioning))
+
+    # Add the fake full 1s mask to the first dimension.
+    image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
+    image_conditioning = image_conditioning.to(x.dtype)
+
+    return image_conditioning
+

 class StableDiffusionProcessing():
    """
    The first set of paramaters: sd_models -> do_not_reload_embeddings represent the minimum required to create a StableDiffusionProcessing
    """
-    def __init__(self, sd_model=None, outpath_samples=None, outpath_grids=None, prompt: str = "", styles: List[str] = None, seed: int = -1, subseed: int = -1, subseed_strength: float = 0, seed_resize_from_h: int = -1, seed_resize_from_w: int = -1, seed_enable_extras: bool = True, sampler_index: int = 0, batch_size: int = 1, n_iter: int = 1, steps: int = 50, cfg_scale: float = 7.0, width: int = 512, height: int = 512, restore_faces: bool = False, tiling: bool = False, do_not_save_samples: bool = False, do_not_save_grid: bool = False, extra_generation_params: Dict[Any, Any] = None, overlay_images: Any = None, negative_prompt: str = None, eta: float = None, do_not_reload_embeddings: bool = False, denoising_strength: float = 0, ddim_discretize: str = None, s_churn: float = 0.0, s_tmax: float = None, s_tmin: float = 0.0, s_noise: float = 1.0, override_settings: Dict[str, Any] = None):
+    def __init__(self, sd_model=None, outpath_samples=None, outpath_grids=None, prompt: str = "", styles: List[str] = None, seed: int = -1, subseed: int = -1, subseed_strength: float = 0, seed_resize_from_h: int = -1, seed_resize_from_w: int = -1, seed_enable_extras: bool = True, sampler_name: str = None, batch_size: int = 1, n_iter: int = 1, steps: int = 50, cfg_scale: float = 7.0, width: int = 512, height: int = 512, restore_faces: bool = False, tiling: bool = False, do_not_save_samples: bool = False, do_not_save_grid: bool = False, extra_generation_params: Dict[Any, Any] = None, overlay_images: Any = None, negative_prompt: str = None, eta: float = None, do_not_reload_embeddings: bool = False, denoising_strength: float = 0, ddim_discretize: str = None, s_churn: float = 0.0, s_tmax: float = None, s_tmin: float = 0.0, s_noise: float = 1.0, override_settings: Dict[str, Any] = None, override_settings_restore_afterwards: bool = True, sampler_index: int = None):
+        if sampler_index is not None:
+            print("sampler_index argument for StableDiffusionProcessing does not do anything; use sampler_name", file=sys.stderr)
+
        self.sd_model = sd_model
        self.outpath_samples: str = outpath_samples
        self.outpath_grids: str = outpath_grids
@ -91,7 +114,7 @@ class StableDiffusionProcessing():
        self.subseed_strength: float = subseed_strength
        self.seed_resize_from_h: int = seed_resize_from_h
        self.seed_resize_from_w: int = seed_resize_from_w
-        self.sampler_index: int = sampler_index
+        self.sampler_name: str = sampler_name
        self.batch_size: int = batch_size
        self.n_iter: int = n_iter
        self.steps: int = steps
@ -116,6 +139,8 @@ class StableDiffusionProcessing():
        self.s_tmax = s_tmax or float('inf')  # not representable as a standard ui option
        self.s_noise = s_noise or opts.s_noise
        self.override_settings = {k: v for k, v in (override_settings or {}).items() if k not in shared.restricted_opts}
+        self.override_settings_restore_afterwards = override_settings_restore_afterwards
+        self.is_using_inpainting_conditioning = False

        if not seed_enable_extras:
            self.subseed = -1
@ -126,33 +151,37 @@ class StableDiffusionProcessing():
        self.scripts = None
        self.script_args = None
        self.all_prompts = None
+        self.all_negative_prompts = None
        self.all_seeds = None
        self.all_subseeds = None
+        self.iteration = 0

    def txt2img_image_conditioning(self, x, width=None, height=None):
-        if self.sampler.conditioning_key not in {'hybrid', 'concat'}:
-            # Dummy zero conditioning if we're not using inpainting model.
-            # Still takes up a bit of memory, but no encoder call.
-            # Pretty sure we can just make this a 1x1 image since its not going to be used besides its batch size.
-            return x.new_zeros(x.shape[0], 5, 1, 1)
+        self.is_using_inpainting_conditioning = self.sd_model.model.conditioning_key in {'hybrid', 'concat'}

-        height = height or self.height
-        width = width or self.width
+        return txt2img_image_conditioning(self.sd_model, x, width or self.width, height or self.height)

-        # The "masked-image" in this case will just be all zeros since the entire image is masked.
-        image_conditioning = torch.zeros(x.shape[0], 3, height, width, device=x.device)
-        image_conditioning = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(image_conditioning)) 
+    def depth2img_image_conditioning(self, source_image):
+        # Use the AddMiDaS helper to format our source image to suit the MiDaS model
+        transformer = AddMiDaS(model_type="dpt_hybrid")
+        transformed = transformer({"jpg": rearrange(source_image[0], "c h w -> h w c")})
+        midas_in = torch.from_numpy(transformed["midas_in"][None, ...]).to(device=shared.device)
+        midas_in = repeat(midas_in, "1 ... -> n ...", n=self.batch_size)

-        # Add the fake full 1s mask to the first dimension.
-        image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
-        image_conditioning = image_conditioning.to(x.dtype)            
+        conditioning_image = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(source_image))
+        conditioning = torch.nn.functional.interpolate(
+            self.sd_model.depth_model(midas_in),
+            size=conditioning_image.shape[2:],
+            mode="bicubic",
+            align_corners=False,
+        )

-        return image_conditioning
+        (depth_min, depth_max) = torch.aminmax(conditioning)
+        conditioning = 2. * (conditioning - depth_min) / (depth_max - depth_min) - 1.
+        return conditioning

-    def img2img_image_conditioning(self, source_image, latent_image, image_mask = None):
-        if self.sampler.conditioning_key not in {'hybrid', 'concat'}:
-            # Dummy zero conditioning if we're not using inpainting model.
-            return latent_image.new_zeros(latent_image.shape[0], 5, 1, 1)
+    def inpainting_image_conditioning(self, source_image, latent_image, image_mask = None):
+        self.is_using_inpainting_conditioning = True

        # Handle the different mask inputs
        if image_mask is not None:
@ -176,7 +205,7 @@ class StableDiffusionProcessing():
            source_image * (1.0 - conditioning_mask),
            getattr(self, "inpainting_mask_weight", shared.opts.inpainting_mask_weight)
        )
-        
+
        # Encode the new masked image using first stage of network.
        conditioning_image = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(conditioning_image))

@ -188,6 +217,18 @@ class StableDiffusionProcessing():

        return image_conditioning

+    def img2img_image_conditioning(self, source_image, latent_image, image_mask=None):
+        # HACK: Using introspection as the Depth2Image model doesn't appear to uniquely
+        # identify itself with a field common to all models. The conditioning_key is also hybrid.
+        if isinstance(self.sd_model, LatentDepth2ImageDiffusion):
+            return self.depth2img_image_conditioning(source_image)
+
+        if self.sampler.conditioning_key in {'hybrid', 'concat'}:
+            return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
+
+        # Dummy zero conditioning if we're not using inpainting or depth model.
+        return latent_image.new_zeros(latent_image.shape[0], 5, 1, 1)
+
    def init(self, all_prompts, all_seeds, all_subseeds):
        pass

@ -200,7 +241,7 @@ class StableDiffusionProcessing():


 class Processed:
-    def __init__(self, p: StableDiffusionProcessing, images_list, seed=-1, info="", subseed=None, all_prompts=None, all_seeds=None, all_subseeds=None, index_of_first_image=0, infotexts=None):
+    def __init__(self, p: StableDiffusionProcessing, images_list, seed=-1, info="", subseed=None, all_prompts=None, all_negative_prompts=None, all_seeds=None, all_subseeds=None, index_of_first_image=0, infotexts=None, comments=""):
        self.images = images_list
        self.prompt = p.prompt
        self.negative_prompt = p.negative_prompt
@ -208,10 +249,10 @@ class Processed:
        self.subseed = subseed
        self.subseed_strength = p.subseed_strength
        self.info = info
+        self.comments = comments
        self.width = p.width
        self.height = p.height
-        self.sampler_index = p.sampler_index
-        self.sampler = sd_samplers.samplers[p.sampler_index].name
+        self.sampler_name = p.sampler_name
        self.cfg_scale = p.cfg_scale
        self.steps = p.steps
        self.batch_size = p.batch_size
@ -238,17 +279,20 @@ class Processed:
        self.negative_prompt = self.negative_prompt if type(self.negative_prompt) != list else self.negative_prompt[0]
        self.seed = int(self.seed if type(self.seed) != list else self.seed[0]) if self.seed is not None else -1
        self.subseed = int(self.subseed if type(self.subseed) != list else self.subseed[0]) if self.subseed is not None else -1
+        self.is_using_inpainting_conditioning = p.is_using_inpainting_conditioning

-        self.all_prompts = all_prompts or [self.prompt]
-        self.all_seeds = all_seeds or [self.seed]
-        self.all_subseeds = all_subseeds or [self.subseed]
+        self.all_prompts = all_prompts or p.all_prompts or [self.prompt]
+        self.all_negative_prompts = all_negative_prompts or p.all_negative_prompts or [self.negative_prompt]
+        self.all_seeds = all_seeds or p.all_seeds or [self.seed]
+        self.all_subseeds = all_subseeds or p.all_subseeds or [self.subseed]
        self.infotexts = infotexts or [info]

    def js(self):
        obj = {
-            "prompt": self.prompt,
+            "prompt": self.all_prompts[0],
            "all_prompts": self.all_prompts,
-            "negative_prompt": self.negative_prompt,
+            "negative_prompt": self.all_negative_prompts[0],
+            "all_negative_prompts": self.all_negative_prompts,
            "seed": self.seed,
            "all_seeds": self.all_seeds,
            "subseed": self.subseed,
@ -256,8 +300,7 @@ class Processed:
            "subseed_strength": self.subseed_strength,
            "width": self.width,
            "height": self.height,
-            "sampler_index": self.sampler_index,
-            "sampler": self.sampler,
+            "sampler_name": self.sampler_name,
            "cfg_scale": self.cfg_scale,
            "steps": self.steps,
            "batch_size": self.batch_size,
@ -273,11 +316,12 @@ class Processed:
            "styles": self.styles,
            "job_timestamp": self.job_timestamp,
            "clip_skip": self.clip_skip,
+            "is_using_inpainting_conditioning": self.is_using_inpainting_conditioning,
        }

        return json.dumps(obj)

-    def infotext(self,  p: StableDiffusionProcessing, index):
+    def infotext(self, p: StableDiffusionProcessing, index):
        return create_infotext(p, self.all_prompts, self.all_seeds, self.all_subseeds, comments=[], position_in_batch=index % self.batch_size, iteration=index // self.batch_size)


@ -297,13 +341,14 @@ def slerp(val, low, high):


 def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, seed_resize_from_h=0, seed_resize_from_w=0, p=None):
+    eta_noise_seed_delta = opts.eta_noise_seed_delta or 0
    xs = []

    # if we have multiple seeds, this means we are working with batch size>1; this then
    # enables the generation of additional tensors with noise that the sampler will use during its processing.
    # Using those pre-generated tensors instead of simple torch.randn allows a batch with seeds [100, 101] to
    # produce the same images as with two batches [100], [101].
-    if p is not None and p.sampler is not None and (len(seeds) > 1 and opts.enable_batch_seeds or opts.eta_noise_seed_delta > 0):
+    if p is not None and p.sampler is not None and (len(seeds) > 1 and opts.enable_batch_seeds or eta_noise_seed_delta > 0):
        sampler_noises = [[] for _ in range(p.sampler.number_of_needed_noises(p))]
    else:
        sampler_noises = None
@ -343,8 +388,8 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see
        if sampler_noises is not None:
            cnt = p.sampler.number_of_needed_noises(p)

-            if opts.eta_noise_seed_delta > 0:
-                torch.manual_seed(seed + opts.eta_noise_seed_delta)
+            if eta_noise_seed_delta > 0:
+                torch.manual_seed(seed + eta_noise_seed_delta)

            for j in range(cnt):
                sampler_noises[j].append(devices.randn_without_seed(tuple(noise_shape)))
@ -377,14 +422,14 @@ def fix_seed(p):
    p.subseed = get_fixed_seed(p.subseed)


-def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments, iteration=0, position_in_batch=0):
+def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iteration=0, position_in_batch=0):
    index = position_in_batch + iteration * p.batch_size

    clip_skip = getattr(p, 'clip_skip', opts.CLIP_stop_at_last_layers)

    generation_params = {
        "Steps": p.steps,
-        "Sampler": get_correct_sampler(p)[p.sampler_index].name,
+        "Sampler": p.sampler_name,
        "CFG scale": p.cfg_scale,
        "Seed": all_seeds[index],
        "Face restoration": (opts.face_restoration_model if p.restore_faces else None),
@ -392,6 +437,7 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments, iteration
        "Model hash": getattr(p, 'sd_model_hash', None if not opts.add_model_hash_to_info or not shared.sd_model.sd_model_hash else shared.sd_model.sd_model_hash),
        "Model": (None if not opts.add_model_name_to_info or not shared.sd_model.sd_checkpoint_info.model_name else shared.sd_model.sd_checkpoint_info.model_name.replace(',', '').replace(':', '')),
        "Hypernet": (None if shared.loaded_hypernetwork is None else shared.loaded_hypernetwork.name),
+        "Hypernet hash": (None if shared.loaded_hypernetwork is None else sd_models.model_hash(shared.loaded_hypernetwork.filename)),
        "Hypernet strength": (None if shared.loaded_hypernetwork is None or shared.opts.sd_hypernetwork_strength >= 1 else shared.opts.sd_hypernetwork_strength),
        "Batch size": (None if p.batch_size < 2 else p.batch_size),
        "Batch pos": (None if p.batch_size < 2 else position_in_batch),
@ -399,6 +445,7 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments, iteration
        "Variation seed strength": (None if p.subseed_strength == 0 else p.subseed_strength),
        "Seed resize from": (None if p.seed_resize_from_w == 0 or p.seed_resize_from_h == 0 else f"{p.seed_resize_from_w}x{p.seed_resize_from_h}"),
        "Denoising strength": getattr(p, 'denoising_strength', None),
+        "Conditional mask weight": getattr(p, "inpainting_mask_weight", shared.opts.inpainting_mask_weight) if p.is_using_inpainting_conditioning else None,
        "Eta": (None if p.sampler is None or p.sampler.eta == p.sampler.default_eta else p.sampler.eta),
        "Clip skip": None if clip_skip <= 1 else clip_skip,
        "ENSD": None if opts.eta_noise_seed_delta == 0 else opts.eta_noise_seed_delta,
@ -408,7 +455,7 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments, iteration

    generation_params_text = ", ".join([k if k == v else f'{k}: {generation_parameters_copypaste.quote(v)}' for k, v in generation_params.items() if v is not None])

-    negative_prompt_text = "\nNegative prompt: " + p.negative_prompt if p.negative_prompt else ""
+    negative_prompt_text = "\nNegative prompt: " + p.all_negative_prompts[index] if p.all_negative_prompts[index] else ""

    return f"{all_prompts[index]}{negative_prompt_text}\n{generation_params_text}".strip()

@ -418,13 +465,21 @@ def process_images(p: StableDiffusionProcessing) -> Processed:

    try:
        for k, v in p.override_settings.items():
-            setattr(opts, k, v) # we don't call onchange for simplicity which makes changing model, hypernet impossible
+            setattr(opts, k, v)
+            if k == 'sd_hypernetwork': shared.reload_hypernetworks()  # make onchange call for changing hypernet
+            if k == 'sd_model_checkpoint': sd_models.reload_model_weights()  # make onchange call for changing SD model
+            if k == 'sd_vae': sd_vae.reload_vae_weights()  # make onchange call for changing VAE

        res = process_images_inner(p)

    finally:
-        for k, v in stored_opts.items():
-            setattr(opts, k, v)
+        # restore opts to original state
+        if p.override_settings_restore_afterwards:
+            for k, v in stored_opts.items():
+                setattr(opts, k, v)
+                if k == 'sd_hypernetwork': shared.reload_hypernetworks()
+                if k == 'sd_model_checkpoint': sd_models.reload_model_weights()
+                if k == 'sd_vae': sd_vae.reload_vae_weights()

    return res

@ -437,10 +492,6 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
    else:
        assert p.prompt is not None

-    with open(os.path.join(shared.script_path, "params.txt"), "w", encoding="utf8") as file:
-        processed = Processed(p, [], p.seed, "")
-        file.write(processed.infotext(p, 0))
-
    devices.torch_gc()

    seed = get_fixed_seed(p.seed)
@ -451,12 +502,15 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:

    comments = {}

-    shared.prompt_styles.apply_styles(p)
-
    if type(p.prompt) == list:
-        p.all_prompts = p.prompt
+        p.all_prompts = [shared.prompt_styles.apply_styles_to_prompt(x, p.styles) for x in p.prompt]
    else:
-        p.all_prompts = p.batch_size * p.n_iter * [p.prompt]
+        p.all_prompts = p.batch_size * p.n_iter * [shared.prompt_styles.apply_styles_to_prompt(p.prompt, p.styles)]
+
+    if type(p.negative_prompt) == list:
+        p.all_negative_prompts = [shared.prompt_styles.apply_negative_styles_to_prompt(x, p.styles) for x in p.negative_prompt]
+    else:
+        p.all_negative_prompts = p.batch_size * p.n_iter * [shared.prompt_styles.apply_negative_styles_to_prompt(p.negative_prompt, p.styles)]

    if type(seed) == list:
        p.all_seeds = seed
@ -471,6 +525,10 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
    def infotext(iteration=0, position_in_batch=0):
        return create_infotext(p, p.all_prompts, p.all_seeds, p.all_subseeds, comments, iteration, position_in_batch)

+    with open(os.path.join(shared.script_path, "params.txt"), "w", encoding="utf8") as file:
+        processed = Processed(p, [], p.seed, "")
+        file.write(processed.infotext(p, 0))
+
    if os.path.exists(cmd_opts.embeddings_dir) and not p.do_not_reload_embeddings:
        model_hijack.embedding_db.load_textual_inversion_embeddings()

@ -488,13 +546,16 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
            state.job_count = p.n_iter

        for n in range(p.n_iter):
+            p.iteration = n
+
            if state.skipped:
                state.skipped = False
-            
+
            if state.interrupted:
                break

            prompts = p.all_prompts[n * p.batch_size:(n + 1) * p.batch_size]
+            negative_prompts = p.all_negative_prompts[n * p.batch_size:(n + 1) * p.batch_size]
            seeds = p.all_seeds[n * p.batch_size:(n + 1) * p.batch_size]
            subseeds = p.all_subseeds[n * p.batch_size:(n + 1) * p.batch_size]

@ -505,7 +566,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
                p.scripts.process_batch(p, batch_number=n, prompts=prompts, seeds=seeds, subseeds=subseeds)

            with devices.autocast():
-                uc = prompt_parser.get_learned_conditioning(shared.sd_model, len(prompts) * [p.negative_prompt], p.steps)
+                uc = prompt_parser.get_learned_conditioning(shared.sd_model, negative_prompts, p.steps)
                c = prompt_parser.get_multicond_learned_conditioning(shared.sd_model, prompts, p.steps)

            if len(model_hijack.comments) > 0:
@ -518,8 +579,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
            with devices.autocast():
                samples_ddim = p.sample(conditioning=c, unconditional_conditioning=uc, seeds=seeds, subseeds=subseeds, subseed_strength=p.subseed_strength, prompts=prompts)

-            samples_ddim = samples_ddim.to(devices.dtype_vae)
-            x_samples_ddim = decode_first_stage(p.sd_model, samples_ddim)
+            x_samples_ddim = [decode_first_stage(p.sd_model, samples_ddim[i:i+1].to(dtype=devices.dtype_vae))[0].cpu() for i in range(samples_ddim.size(0))]
+            x_samples_ddim = torch.stack(x_samples_ddim).float()
            x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)

            del samples_ddim
@ -529,9 +590,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:

            devices.torch_gc()

-            if opts.filter_nsfw:
-                import modules.safety as safety
-                x_samples_ddim = modules.safety.censor_batch(x_samples_ddim)
+            if p.scripts is not None:
+                p.scripts.postprocess_batch(p, x_samples_ddim, batch_number=n)

            for i, x_sample in enumerate(x_samples_ddim):
                x_sample = 255. * np.moveaxis(x_sample.cpu().numpy(), 0, 2)
@ -565,7 +625,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
                    image.info["parameters"] = text
                output_images.append(image)

-            del x_samples_ddim 
+            del x_samples_ddim

            devices.torch_gc()

@ -591,7 +651,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:

    devices.torch_gc()

-    res = Processed(p, output_images, p.all_seeds[0], infotext() + "".join(["\n\n" + x for x in comments]), subseed=p.all_subseeds[0], all_prompts=p.all_prompts, all_seeds=p.all_seeds, all_subseeds=p.all_subseeds, index_of_first_image=index_of_first_image, infotexts=infotexts)
+    res = Processed(p, output_images, p.all_seeds[0], infotext(), comments="".join(["\n\n" + x for x in comments]), subseed=p.all_subseeds[0], index_of_first_image=index_of_first_image, infotexts=infotexts)

    if p.scripts is not None:
        p.scripts.postprocess(p, res)
@ -602,14 +662,18 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
 class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
    sampler = None

-    def __init__(self, enable_hr: bool=False, denoising_strength: float=0.75, firstphase_width: int=0, firstphase_height: int=0, **kwargs):
+    def __init__(self, enable_hr: bool = False, denoising_strength: float = 0.75, firstphase_width: int = 0, firstphase_height: int = 0, hr_scale: float = 2.0, hr_upscaler: str = None, **kwargs):
        super().__init__(**kwargs)
        self.enable_hr = enable_hr
        self.denoising_strength = denoising_strength
-        self.firstphase_width = firstphase_width
-        self.firstphase_height = firstphase_height
-        self.truncate_x = 0
-        self.truncate_y = 0
+        self.hr_scale = hr_scale  # multiplier applied to width/height to get the second-pass target size in sample()
+        self.hr_upscaler = hr_upscaler  # name looked up in shared.latent_upscale_modes / shared.sd_upscalers by sample(); None selects the default latent mode
+
+        if firstphase_width != 0 or firstphase_height != 0:  # legacy arguments: translate them into the hr_scale model instead of rejecting them
+            print("firstphase_width/firstphase_height no longer supported; use hr_scale", file=sys.stderr)
+            self.hr_scale = self.width / firstphase_width  # NOTE(review): ZeroDivisionError when only firstphase_height is given (firstphase_width == 0); the height ratio is never consulted
+            self.width = firstphase_width  # the old first-pass size becomes the base size (assumes super().__init__ already set self.width/self.height — confirm)
+            self.height = firstphase_height

    def init(self, all_prompts, all_seeds, all_subseeds):
        if self.enable_hr:
@ -618,62 +682,45 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
            else:
                state.job_count = state.job_count * 2

-            self.extra_generation_params["First pass size"] = f"{self.firstphase_width}x{self.firstphase_height}"
-
-            if self.firstphase_width == 0 or self.firstphase_height == 0:
-                desired_pixel_count = 512 * 512
-                actual_pixel_count = self.width * self.height
-                scale = math.sqrt(desired_pixel_count / actual_pixel_count)
-                self.firstphase_width = math.ceil(scale * self.width / 64) * 64
-                self.firstphase_height = math.ceil(scale * self.height / 64) * 64
-                firstphase_width_truncated = int(scale * self.width)
-                firstphase_height_truncated = int(scale * self.height)
-
-            else:
-
-                width_ratio = self.width / self.firstphase_width
-                height_ratio = self.height / self.firstphase_height
-
-                if width_ratio > height_ratio:
-                    firstphase_width_truncated = self.firstphase_width
-                    firstphase_height_truncated = self.firstphase_width * self.height / self.width
-                else:
-                    firstphase_width_truncated = self.firstphase_height * self.width / self.height
-                    firstphase_height_truncated = self.firstphase_height
-
-            self.truncate_x = int(self.firstphase_width - firstphase_width_truncated) // opt_f
-            self.truncate_y = int(self.firstphase_height - firstphase_height_truncated) // opt_f
+            self.extra_generation_params["Hires upscale"] = self.hr_scale
+            if self.hr_upscaler is not None:
+                self.extra_generation_params["Hires upscaler"] = self.hr_upscaler

    def sample(self, conditioning, unconditional_conditioning, seeds, subseeds, subseed_strength, prompts):
-        self.sampler = sd_samplers.create_sampler_with_index(sd_samplers.samplers, self.sampler_index, self.sd_model)
+        self.sampler = sd_samplers.create_sampler(self.sampler_name, self.sd_model)
+
+        latent_scale_mode = shared.latent_upscale_modes.get(self.hr_upscaler, None) if self.hr_upscaler is not None else shared.latent_upscale_modes.get(shared.latent_upscale_default_mode, "nearest")
+        if self.enable_hr and latent_scale_mode is None:
+            assert len([x for x in shared.sd_upscalers if x.name == self.hr_upscaler]) > 0, f"could not find upscaler named {self.hr_upscaler}"
+
+        x = create_random_tensors([opt_C, self.height // opt_f, self.width // opt_f], seeds=seeds, subseeds=subseeds, subseed_strength=self.subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self)
+        samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x))

        if not self.enable_hr:
-            x = create_random_tensors([opt_C, self.height // opt_f, self.width // opt_f], seeds=seeds, subseeds=subseeds, subseed_strength=self.subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self)
-            samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x))
            return samples

-        x = create_random_tensors([opt_C, self.firstphase_height // opt_f, self.firstphase_width // opt_f], seeds=seeds, subseeds=subseeds, subseed_strength=self.subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self)
-        samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x, self.firstphase_width, self.firstphase_height))
+        target_width = int(self.width * self.hr_scale)
+        target_height = int(self.height * self.hr_scale)

-        samples = samples[:, :, self.truncate_y//2:samples.shape[2]-self.truncate_y//2, self.truncate_x//2:samples.shape[3]-self.truncate_x//2]
-
-        """saves image before applying hires fix, if enabled in options; takes as an arguyment either an image or batch with latent space images"""
        def save_intermediate(image, index):
+            """saves image before applying hires fix, if enabled in options; takes as an argument either an image or batch with latent space images"""
+
            if not opts.save or self.do_not_save_samples or not opts.save_images_before_highres_fix:
                return

            if not isinstance(image, Image.Image):
-                image = sd_samplers.sample_to_image(image, index)
+                image = sd_samplers.sample_to_image(image, index, approximation=0)

-            images.save_image(image, self.outpath_samples, "", seeds[index], prompts[index], opts.samples_format, suffix="-before-highres-fix")
+            info = create_infotext(self, self.all_prompts, self.all_seeds, self.all_subseeds, [], iteration=self.iteration, position_in_batch=index)
+            images.save_image(image, self.outpath_samples, "", seeds[index], prompts[index], opts.samples_format, info=info, suffix="-before-highres-fix")

-        if opts.use_scale_latent_for_hires_fix:
+        if latent_scale_mode is not None:
            for i in range(samples.shape[0]):
                save_intermediate(samples, i)

-            samples = torch.nn.functional.interpolate(samples, size=(self.height // opt_f, self.width // opt_f), mode="bilinear")
+            samples = torch.nn.functional.interpolate(samples, size=(target_height // opt_f, target_width // opt_f), mode=latent_scale_mode["mode"], antialias=latent_scale_mode["antialias"])

-            # Avoid making the inpainting conditioning unless necessary as 
+            # Avoid making the inpainting conditioning unless necessary as
            # this does need some extra compute to decode / encode the image again.
            if getattr(self, "inpainting_mask_weight", shared.opts.inpainting_mask_weight) < 1.0:
                image_conditioning = self.img2img_image_conditioning(decode_first_stage(self.sd_model, samples), samples)
@ -691,7 +738,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):

                save_intermediate(image, i)

-                image = images.resize_image(0, image, self.width, self.height)
+                image = images.resize_image(0, image, target_width, target_height, upscaler_name=self.hr_upscaler)
                image = np.array(image).astype(np.float32) / 255.0
                image = np.moveaxis(image, 2, 0)
                batch_images.append(image)
@ -706,9 +753,9 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):

        shared.state.nextjob()

-        self.sampler = sd_samplers.create_sampler_with_index(sd_samplers.samplers, self.sampler_index, self.sd_model)
+        self.sampler = sd_samplers.create_sampler(self.sampler_name, self.sd_model)

-        noise = create_random_tensors(samples.shape[1:], seeds=seeds, subseeds=subseeds, subseed_strength=subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self)
+        noise = create_random_tensors(samples.shape[1:], seeds=seeds, subseeds=subseeds, subseed_strength=subseed_strength, p=self)

        # GC now before running the next img2img to prevent running out of memory
        x = None
@ -722,7 +769,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
 class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
    sampler = None

-    def __init__(self, init_images: list=None, resize_mode: int=0, denoising_strength: float=0.75, mask: Any=None, mask_blur: int=4, inpainting_fill: int=0, inpaint_full_res: bool=True, inpaint_full_res_padding: int=0, inpainting_mask_invert: int=0, **kwargs):
+    def __init__(self, init_images: list = None, resize_mode: int = 0, denoising_strength: float = 0.75, mask: Any = None, mask_blur: int = 4, inpainting_fill: int = 0, inpaint_full_res: bool = True, inpaint_full_res_padding: int = 0, inpainting_mask_invert: int = 0, initial_noise_multiplier: float = None, **kwargs):
        super().__init__(**kwargs)

        self.init_images = init_images
@ -730,7 +777,6 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
        self.denoising_strength: float = denoising_strength
        self.init_latent = None
        self.image_mask = mask
-        #self.image_unblurred_mask = None
        self.latent_mask = None
        self.mask_for_overlay = None
        self.mask_blur = mask_blur
@ -738,66 +784,68 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
        self.inpaint_full_res = inpaint_full_res
        self.inpaint_full_res_padding = inpaint_full_res_padding
        self.inpainting_mask_invert = inpainting_mask_invert
+        self.initial_noise_multiplier = opts.initial_noise_multiplier if initial_noise_multiplier is None else initial_noise_multiplier
        self.mask = None
        self.nmask = None
        self.image_conditioning = None

    def init(self, all_prompts, all_seeds, all_subseeds):
-        self.sampler = sd_samplers.create_sampler_with_index(sd_samplers.samplers_for_img2img, self.sampler_index, self.sd_model)
+        self.sampler = sd_samplers.create_sampler(self.sampler_name, self.sd_model)
        crop_region = None

-        if self.image_mask is not None:
-            self.image_mask = self.image_mask.convert('L')
+        image_mask = self.image_mask
+
+        if image_mask is not None:
+            image_mask = image_mask.convert('L')

            if self.inpainting_mask_invert:
-                self.image_mask = ImageOps.invert(self.image_mask)
-
-            #self.image_unblurred_mask = self.image_mask
+                image_mask = ImageOps.invert(image_mask)

            if self.mask_blur > 0:
-                self.image_mask = self.image_mask.filter(ImageFilter.GaussianBlur(self.mask_blur))
+                image_mask = image_mask.filter(ImageFilter.GaussianBlur(self.mask_blur))

            if self.inpaint_full_res:
-                self.mask_for_overlay = self.image_mask
-                mask = self.image_mask.convert('L')
+                self.mask_for_overlay = image_mask
+                mask = image_mask.convert('L')
                crop_region = masking.get_crop_region(np.array(mask), self.inpaint_full_res_padding)
                crop_region = masking.expand_crop_region(crop_region, self.width, self.height, mask.width, mask.height)
                x1, y1, x2, y2 = crop_region

                mask = mask.crop(crop_region)
-                self.image_mask = images.resize_image(2, mask, self.width, self.height)
+                image_mask = images.resize_image(2, mask, self.width, self.height)
                self.paste_to = (x1, y1, x2-x1, y2-y1)
            else:
-                self.image_mask = images.resize_image(self.resize_mode, self.image_mask, self.width, self.height)
-                np_mask = np.array(self.image_mask)
+                image_mask = images.resize_image(self.resize_mode, image_mask, self.width, self.height)
+                np_mask = np.array(image_mask)
                np_mask = np.clip((np_mask.astype(np.float32)) * 2, 0, 255).astype(np.uint8)
                self.mask_for_overlay = Image.fromarray(np_mask)

            self.overlay_images = []

-        latent_mask = self.latent_mask if self.latent_mask is not None else self.image_mask
+        latent_mask = self.latent_mask if self.latent_mask is not None else image_mask

        add_color_corrections = opts.img2img_color_correction and self.color_corrections is None
        if add_color_corrections:
            self.color_corrections = []
        imgs = []
        for img in self.init_images:
-            image = img.convert("RGB")
+            image = images.flatten(img, opts.img2img_background_color)

-            if crop_region is None:
+            if crop_region is None and self.resize_mode != 3:
                image = images.resize_image(self.resize_mode, image, self.width, self.height)

-            if self.image_mask is not None:
+            if image_mask is not None:
                image_masked = Image.new('RGBa', (image.width, image.height))
                image_masked.paste(image.convert("RGBA").convert("RGBa"), mask=ImageOps.invert(self.mask_for_overlay.convert('L')))

                self.overlay_images.append(image_masked.convert('RGBA'))

+            # crop_region is not None if we are doing inpaint full res
            if crop_region is not None:
                image = image.crop(crop_region)
                image = images.resize_image(2, image, self.width, self.height)

-            if self.image_mask is not None:
+            if image_mask is not None:
                if self.inpainting_fill != 1:
                    image = masking.fill(image, latent_mask)

@ -829,7 +877,10 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):

        self.init_latent = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(image))

-        if self.image_mask is not None:
+        if self.resize_mode == 3:
+            self.init_latent = torch.nn.functional.interpolate(self.init_latent, size=(self.height // opt_f, self.width // opt_f), mode="bilinear")
+
+        if image_mask is not None:
            init_mask = latent_mask
            latmask = init_mask.convert('RGB').resize((self.init_latent.shape[3], self.init_latent.shape[2]))
            latmask = np.moveaxis(np.array(latmask, dtype=np.float32), 2, 0) / 255
@ -846,11 +897,15 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
            elif self.inpainting_fill == 3:
                self.init_latent = self.init_latent * self.mask

-        self.image_conditioning = self.img2img_image_conditioning(image, self.init_latent, self.image_mask)
+        self.image_conditioning = self.img2img_image_conditioning(image, self.init_latent, image_mask)

    def sample(self, conditioning, unconditional_conditioning, seeds, subseeds, subseed_strength, prompts):
        x = create_random_tensors([opt_C, self.height // opt_f, self.width // opt_f], seeds=seeds, subseeds=subseeds, subseed_strength=self.subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self)

+        if self.initial_noise_multiplier != 1.0:
+            self.extra_generation_params["Noise multiplier"] = self.initial_noise_multiplier
+            x *= self.initial_noise_multiplier
+
        samples = self.sampler.sample_img2img(self, self.init_latent, x, conditioning, unconditional_conditioning, image_conditioning=self.image_conditioning)

        if self.mask is not None:
--- a/modules/safe.py
+++ b/modules/safe.py
@ -37,16 +37,16 @@ class RestrictedUnpickler(pickle.Unpickler):

        if module == 'collections' and name == 'OrderedDict':
            return getattr(collections, name)
-        if module == 'torch._utils' and name in ['_rebuild_tensor_v2', '_rebuild_parameter']:
+        if module == 'torch._utils' and name in ['_rebuild_tensor_v2', '_rebuild_parameter', '_rebuild_device_tensor_from_numpy']:
            return getattr(torch._utils, name)
-        if module == 'torch' and name in ['FloatStorage', 'HalfStorage', 'IntStorage', 'LongStorage', 'DoubleStorage', 'ByteStorage']:
+        if module == 'torch' and name in ['FloatStorage', 'HalfStorage', 'IntStorage', 'LongStorage', 'DoubleStorage', 'ByteStorage', 'float32']:
            return getattr(torch, name)
        if module == 'torch.nn.modules.container' and name in ['ParameterDict']:
            return getattr(torch.nn.modules.container, name)
-        if module == 'numpy.core.multiarray' and name == 'scalar':
-            return numpy.core.multiarray.scalar
-        if module == 'numpy' and name == 'dtype':
-            return numpy.dtype
+        if module == 'numpy.core.multiarray' and name in ['scalar', '_reconstruct']:
+            return getattr(numpy.core.multiarray, name)
+        if module == 'numpy' and name in ['dtype', 'ndarray']:
+            return getattr(numpy, name)
        if module == '_codecs' and name == 'encode':
            return encode
        if module == "pytorch_lightning.callbacks" and name == 'model_checkpoint':
@ -62,14 +62,12 @@ class RestrictedUnpickler(pickle.Unpickler):
        raise Exception(f"global '{module}/{name}' is forbidden")


-allowed_zip_names = ["archive/data.pkl", "archive/version"]
-allowed_zip_names_re = re.compile(r"^archive/data/\d+$")
-
+# Regular expression that accepts 'dirname/version', 'dirname/data.pkl', and 'dirname/data/<number>'
+allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|(data\.pkl))$")  # dirname is any single path component (no '/'), no longer fixed to 'archive'
+data_pkl_re = re.compile(r"^([^/]+)/data\.pkl$")  # matches only '<dirname>/data.pkl'; check_pt uses it to locate the pickle it verifies

 def check_zip_filenames(filename, names):
    for name in names:
-        if name in allowed_zip_names:
-            continue
        if allowed_zip_names_re.match(name):
            continue

@ -83,7 +81,13 @@ def check_pt(filename, extra_handler):
        with zipfile.ZipFile(filename) as z:
            check_zip_filenames(filename, z.namelist())

-            with z.open('archive/data.pkl') as file:
+            # find filename of data.pkl in zip file: '<directory name>/data.pkl'
+            data_pkl_filenames = [f for f in z.namelist() if data_pkl_re.match(f)]
+            if len(data_pkl_filenames) == 0:
+                raise Exception(f"data.pkl not found in {filename}")
+            if len(data_pkl_filenames) > 1:
+                raise Exception(f"Multiple data.pkl found in {filename}")
+            with z.open(data_pkl_filenames[0]) as file:
                unpickler = RestrictedUnpickler(file)
                unpickler.extra_handler = extra_handler
                unpickler.load()
@ -99,12 +103,12 @@ def check_pt(filename, extra_handler):


 def load(filename, *args, **kwargs):
-    return load_with_extra(filename, *args, **kwargs)
+    return load_with_extra(filename, extra_handler=global_extra_handler, *args, **kwargs)  # forwards the handler installed by Extra; NOTE(review): a positional argument in *args binds to load_with_extra's second parameter (extra_handler) and collides with the keyword -> TypeError, so extra torch.load arguments must be passed by keyword


 def load_with_extra(filename, extra_handler=None, *args, **kwargs):
    """
-    this functon is intended to be used by extensions that want to load models with
+    this function is intended to be used by extensions that want to load models with
    some extra classes in them that the usual unpickler would find suspicious.

    Use the extra_handler argument to specify a function that takes module and field name as text,
@ -133,19 +137,56 @@ def load_with_extra(filename, extra_handler=None, *args, **kwargs):
    except pickle.UnpicklingError:
        print(f"Error verifying pickled file from {filename}:", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
-        print(f"-----> !!!! The file is most likely corrupted !!!! <-----", file=sys.stderr)
-        print(f"You can skip this check with --disable-safe-unpickle commandline argument, but that is not going to help you.\n\n", file=sys.stderr)
+        print("-----> !!!! The file is most likely corrupted !!!! <-----", file=sys.stderr)
+        print("You can skip this check with --disable-safe-unpickle commandline argument, but that is not going to help you.\n\n", file=sys.stderr)
        return None

    except Exception:
        print(f"Error verifying pickled file from {filename}:", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
-        print(f"\nThe file may be malicious, so the program is not going to read it.", file=sys.stderr)
-        print(f"You can skip this check with --disable-safe-unpickle commandline argument.\n\n", file=sys.stderr)
+        print("\nThe file may be malicious, so the program is not going to read it.", file=sys.stderr)
+        print("You can skip this check with --disable-safe-unpickle commandline argument.\n\n", file=sys.stderr)
        return None

    return unsafe_torch_load(filename, *args, **kwargs)


+class Extra:  # NOTE(review): state lives in one module-level global, so load() calls made elsewhere while a block is active see the same handler
+    """
+    A class for temporarily setting the global handler for when you can't explicitly call load_with_extra
+    (because it's not your code making the torch.load call). The intended use is like this:
+
+```
+import torch
+from modules import safe
+
+def handler(module, name):
+    if module == 'torch' and name in ['float64', 'float16']:
+        return getattr(torch, name)
+
+    return None
+
+with safe.Extra(handler):
+    x = torch.load('model.pt')
+```
+    """
+
+    def __init__(self, handler):  # handler is a callable taking (module, name), as in the docstring example
+        self.handler = handler
+
+    def __enter__(self):  # returns None, so `with Extra(h) as x` binds nothing useful
+        global global_extra_handler
+
+        assert global_extra_handler is None, 'already inside an Extra() block'  # nesting is rejected; note that asserts are skipped under `python -O`
+        global_extra_handler = self.handler  # read by load(), which passes it on as extra_handler
+
+    def __exit__(self, exc_type, exc_val, exc_tb):  # returns None, so an exception raised inside the with-body propagates
+        global global_extra_handler
+
+        global_extra_handler = None  # cleared unconditionally, including when the body raised
+
+
 unsafe_torch_load = torch.load
 torch.load = load
+global_extra_handler = None  # handler that load() forwards to load_with_extra; set and cleared by Extra.__enter__/__exit__
+
--- a/modules/safety.py
+++ b/modules/safety.py
@ -1,42 +0,0 @@
-import torch
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from transformers import AutoFeatureExtractor
-from PIL import Image
-
-import modules.shared as shared
-
-safety_model_id = "CompVis/stable-diffusion-safety-checker"
-safety_feature_extractor = None
-safety_checker = None
-
-def numpy_to_pil(images):
-    """
-    Convert a numpy image or a batch of images to a PIL image.
-    """
-    if images.ndim == 3:
-        images = images[None, ...]
-    images = (images * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-
-    return pil_images
-
-# check and replace nsfw content
-def check_safety(x_image):
-    global safety_feature_extractor, safety_checker
-
-    if safety_feature_extractor is None:
-        safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id)
-        safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id)
-
-    safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
-    x_checked_image, has_nsfw_concept = safety_checker(images=x_image, clip_input=safety_checker_input.pixel_values)
-
-    return x_checked_image, has_nsfw_concept
-
-
-def censor_batch(x):
-    x_samples_ddim_numpy = x.cpu().permute(0, 2, 3, 1).numpy()
-    x_checked_image, has_nsfw_concept = check_safety(x_samples_ddim_numpy)
-    x = torch.from_numpy(x_checked_image).permute(0, 3, 1, 2)
-
-    return x
--- a/modules/script_callbacks.py
+++ b/modules/script_callbacks.py
@ -51,6 +51,13 @@ class UiTrainTabParams:
        self.txt2img_preview_params = txt2img_preview_params


+class ImageGridLoopParams:
+    """Images plus column and row counts, passed to image grid callbacks."""
+
+    def __init__(self, imgs, cols, rows):
+        self.imgs, self.cols, self.rows = imgs, cols, rows
+
+
 ScriptCallback = namedtuple("ScriptCallback", ["script", "callback"])
 callback_map = dict(
    callbacks_app_started=[],
@ -61,6 +68,9 @@ callback_map = dict(
    callbacks_before_image_saved=[],
    callbacks_image_saved=[],
    callbacks_cfg_denoiser=[],
+    callbacks_before_component=[],
+    callbacks_after_component=[],
+    callbacks_image_grid=[],
 )


@ -137,6 +147,30 @@ def cfg_denoiser_callback(params: CFGDenoiserParams):
            report_exception(c, 'cfg_denoiser_callback')


+def before_component_callback(component, **kwargs):
+    """Run every registered before-component callback with the component and its keyword arguments.
+
+    A callback that raises is reported and does not prevent the remaining callbacks from running.
+    """
+    registered = callback_map['callbacks_before_component']
+    for entry in registered:
+        try:
+            entry.callback(component, **kwargs)
+        except Exception:
+            report_exception(entry, 'before_component_callback')
+
+
+def after_component_callback(component, **kwargs):
+    """Run every registered after-component callback with the component and its keyword arguments.
+
+    A callback that raises is reported and does not prevent the remaining callbacks from running.
+    """
+    registered = callback_map['callbacks_after_component']
+    for entry in registered:
+        try:
+            entry.callback(component, **kwargs)
+        except Exception:
+            report_exception(entry, 'after_component_callback')
+
+
+def image_grid_callback(params: ImageGridLoopParams):
+    """Run every registered image grid callback with the given grid parameters.
+
+    A callback that raises is reported and does not prevent the remaining callbacks from running.
+    """
+    for c in callback_map['callbacks_image_grid']:
+        try:
+            c.callback(params)
+        except Exception:
+            # report under this function's own name, like the other *_callback dispatchers do
+            report_exception(c, 'image_grid_callback')
+
+
 def add_callback(callbacks, fun):
    stack = [x for x in inspect.stack() if x.filename != __file__]
    filename = stack[0].filename if len(stack) > 0 else 'unknown file'
@ -220,3 +254,28 @@ def on_cfg_denoiser(callback):
        - params: CFGDenoiserParams - parameters to be passed to the inner model and sampling state details.
    """
    add_callback(callback_map['callbacks_cfg_denoiser'], callback)
+
+
+def on_before_component(callback):
+    """register a function to be called right before a component is created.
+    The callback receives:
+        - component - the gradio component that is about to be created.
+        - **kwargs - the arguments given to gradio.components.IOComponent.__init__
+
+    Inspect the elem_id/label entries of kwargs to tell which component it is.
+    This makes it possible to inject your own components in the middle of the vanilla UI.
+    """
+    registry = callback_map['callbacks_before_component']
+    add_callback(registry, callback)
+
+
+def on_after_component(callback):
+    """register a function to be called right after a component is created.
+    It receives the same arguments as a callback registered with on_before_component.
+    """
+    registry = callback_map['callbacks_after_component']
+    add_callback(registry, callback)
+
+
+def on_image_grid(callback):
+    """register a function to be called before an image grid is made.
+    The callback receives a single argument:
+       - params: ImageGridLoopParams - the parameters used for grid creation; they may be modified.
+    """
+    registry = callback_map['callbacks_image_grid']
+    add_callback(registry, callback)
--- a/modules/script_loading.py
+++ b/modules/script_loading.py
@ -0,0 +1,34 @@
+import os
+import sys
+import traceback
+from types import ModuleType
+
+
+def load_module(path):
+    """Execute the Python source file at `path` and return the resulting module object.
+
+    The module is named after the basename of `path`. It is only returned to the caller;
+    this function does not add it to sys.modules.
+    """
+    with open(path, "r", encoding="utf8") as file:
+        text = file.read()
+
+    compiled = compile(text, path, 'exec')
+    module = ModuleType(os.path.basename(path))
+    # set __file__ before executing, so code in the loaded file can refer to its own location
+    # instead of failing with NameError
+    module.__file__ = path
+    exec(compiled, module.__dict__)
+
+    return module
+
+
+def preload_extensions(extensions_dir, parser):
+    """Call preload(parser) from the preload.py of every entry in `extensions_dir`.
+
+    Entries are visited in sorted order. Nothing happens if `extensions_dir` is not a directory.
+    Entries without a preload.py file are skipped, and so are scripts that define no preload().
+    Any error raised while loading or running a script is printed to stderr; processing continues.
+    """
+    if not os.path.isdir(extensions_dir):
+        return
+
+    candidates = (
+        os.path.join(extensions_dir, entry, "preload.py")
+        for entry in sorted(os.listdir(extensions_dir))
+    )
+
+    for script_path in candidates:
+        if os.path.isfile(script_path):
+            try:
+                loaded = load_module(script_path)
+                if hasattr(loaded, 'preload'):
+                    loaded.preload(parser)
+
+            except Exception:
+                print(f"Error running preload() for {script_path}", file=sys.stderr)
+                print(traceback.format_exc(), file=sys.stderr)
--- a/modules/scripts.py
+++ b/modules/scripts.py
@ -6,7 +6,7 @@ from collections import namedtuple
 import gradio as gr

 from modules.processing import StableDiffusionProcessing
-from modules import shared, paths, script_callbacks, extensions
+from modules import shared, paths, script_callbacks, extensions, script_loading

 AlwaysVisible = object()

@ -17,6 +17,9 @@ class Script:
    args_to = None
    alwayson = False

+    is_txt2img = False
+    is_img2img = False
+
    """A gr.Group component that has all script's UI inside it"""
    group = None

@ -33,7 +36,7 @@ class Script:
    def ui(self, is_img2img):
        """this function should create gradio UI elements. See https://gradio.app/docs/#components
        The return value should be an array of all components that are used in processing.
-        Values of those returned componenbts will be passed to run() and process() functions.
+        Values of those returned components will be passed to run() and process() functions.
        """

        pass
@ -44,7 +47,7 @@ class Script:

        This function should return:
         - False if the script should not be shown in UI at all
-         - True if the script should be shown in UI if it's scelected in the scripts drowpdown
+         - True if the script should be shown in UI if it's selected in the scripts dropdown
         - script.AlwaysVisible if the script should be shown in UI at all times
         """

@ -85,6 +88,17 @@ class Script:

        pass

+    def postprocess_batch(self, p, *args, **kwargs):
+        """
+        Same as process_batch(), but called for every batch after it has been generated.
+        The default implementation does nothing.
+
+        **kwargs will have same items as process_batch, and also:
+          - batch_number - index of current batch, from 0 to number of batches-1
+          - images - torch tensor with all generated images, with values ranging from 0 to 1
+        """
+
+        pass
+
    def postprocess(self, p, processed, *args):
        """
        This function is called after processing ends for AlwaysVisible scripts.
@ -93,6 +107,23 @@ class Script:

        pass

+    def before_component(self, component, **kwargs):
+        """
+        Called before a component is created. The default implementation does nothing.
+        kwargs are the keyword arguments the component is being created with.
+        Use elem_id/label fields of kwargs to figure out which component it is.
+        This can be useful to inject your own components somewhere in the middle of vanilla UI.
+        You can return created components in the ui() function to add them to the list of arguments for your processing functions
+        """
+
+        pass
+
+    def after_component(self, component, **kwargs):
+        """
+        Called after a component is created. The default implementation does nothing.
+        Receives the same arguments as before_component() and can be used the same way.
+        """
+
+        pass
+
    def describe(self):
        """unused"""
        return ""
@ -140,7 +171,7 @@ def list_files_with_name(filename):
            continue

        path = os.path.join(dirpath, filename)
-        if os.path.isfile(filename):
+        if os.path.isfile(path):
            res.append(path)

    return res
@ -161,13 +192,7 @@ def load_scripts():
                sys.path = [scriptfile.basedir] + sys.path
            current_basedir = scriptfile.basedir

-            with open(scriptfile.path, "r", encoding="utf8") as file:
-                text = file.read()
-
-            from types import ModuleType
-            compiled = compile(text, scriptfile.path, 'exec')
-            module = ModuleType(scriptfile.filename)
-            exec(compiled, module.__dict__)
+            module = script_loading.load_module(scriptfile.path)

            for key, script_class in module.__dict__.items():
                if type(script_class) == type and issubclass(script_class, Script):
@ -201,12 +226,18 @@ class ScriptRunner:
        self.titles = []
        self.infotext_fields = []

-    def setup_ui(self, is_img2img):
+    def initialize_scripts(self, is_img2img):
+        self.scripts.clear()
+        self.alwayson_scripts.clear()
+        self.selectable_scripts.clear()
+
        for script_class, path, basedir in scripts_data:
            script = script_class()
            script.filename = path
+            script.is_txt2img = not is_img2img
+            script.is_img2img = is_img2img

-            visibility = script.show(is_img2img)
+            visibility = script.show(script.is_img2img)

            if visibility == AlwaysVisible:
                self.scripts.append(script)
@ -217,6 +248,7 @@ class ScriptRunner:
                self.scripts.append(script)
                self.selectable_scripts.append(script)

+    def setup_ui(self):
        self.titles = [wrap_call(script.title, script.filename, "title") or f"{script.filename} [error]" for script in self.selectable_scripts]

        inputs = [None]
@ -226,7 +258,7 @@ class ScriptRunner:
            script.args_from = len(inputs)
            script.args_to = len(inputs)

-            controls = wrap_call(script.ui, script.filename, "ui", is_img2img)
+            controls = wrap_call(script.ui, script.filename, "ui", script.is_img2img)

            if controls is None:
                return
@ -326,33 +358,53 @@ class ScriptRunner:
                print(f"Error running postprocess: {script.filename}", file=sys.stderr)
                print(traceback.format_exc(), file=sys.stderr)

+    def postprocess_batch(self, p, images, **kwargs):
+        """Call postprocess_batch() of each always-on script with its own slice of p.script_args.
+
+        A script that raises is reported on stderr; the remaining scripts still run.
+        """
+        for script in self.alwayson_scripts:
+            try:
+                own_span = slice(script.args_from, script.args_to)
+                own_args = p.script_args[own_span]
+                script.postprocess_batch(p, *own_args, images=images, **kwargs)
+            except Exception:
+                print(f"Error running postprocess_batch: {script.filename}", file=sys.stderr)
+                print(traceback.format_exc(), file=sys.stderr)
+
+    def before_component(self, component, **kwargs):
+        """Forward a before-component notification to every script of this runner.
+
+        A script that raises is reported on stderr; the remaining scripts still run.
+        """
+        for current in self.scripts:
+            try:
+                current.before_component(component, **kwargs)
+            except Exception:
+                details = traceback.format_exc()
+                print(f"Error running before_component: {current.filename}", file=sys.stderr)
+                print(details, file=sys.stderr)
+
+    def after_component(self, component, **kwargs):
+        """Forward an after-component notification to every script of this runner.
+
+        A script that raises is reported on stderr; the remaining scripts still run.
+        """
+        for current in self.scripts:
+            try:
+                current.after_component(component, **kwargs)
+            except Exception:
+                details = traceback.format_exc()
+                print(f"Error running after_component: {current.filename}", file=sys.stderr)
+                print(details, file=sys.stderr)
+
    def reload_sources(self, cache):
        for si, script in list(enumerate(self.scripts)):
-            with open(script.filename, "r", encoding="utf8") as file:
-                args_from = script.args_from
-                args_to = script.args_to
-                filename = script.filename
-                text = file.read()
+            args_from = script.args_from
+            args_to = script.args_to
+            filename = script.filename

-                from types import ModuleType
+            module = cache.get(filename, None)
+            if module is None:
+                module = script_loading.load_module(script.filename)
+                cache[filename] = module

-                module = cache.get(filename, None)
-                if module is None:
-                    compiled = compile(text, filename, 'exec')
-                    module = ModuleType(script.filename)
-                    exec(compiled, module.__dict__)
-                    cache[filename] = module
-
-                for key, script_class in module.__dict__.items():
-                    if type(script_class) == type and issubclass(script_class, Script):
-                        self.scripts[si] = script_class()
-                        self.scripts[si].filename = filename
-                        self.scripts[si].args_from = args_from
-                        self.scripts[si].args_to = args_to
+            for key, script_class in module.__dict__.items():
+                if type(script_class) == type and issubclass(script_class, Script):
+                    self.scripts[si] = script_class()
+                    self.scripts[si].filename = filename
+                    self.scripts[si].args_from = args_from
+                    self.scripts[si].args_to = args_to


 scripts_txt2img = ScriptRunner()
 scripts_img2img = ScriptRunner()
+scripts_current: ScriptRunner = None


 def reload_script_body_only():
@ -369,3 +421,22 @@ def reload_scripts():
    scripts_txt2img = ScriptRunner()
    scripts_img2img = ScriptRunner()

+
+def IOComponent_init(self, *args, **kwargs):
+    """Replacement for gr.components.IOComponent.__init__ that notifies scripts and callbacks.
+
+    Order of calls: before_component of scripts_current (if set), the registered
+    before-component callbacks, the original __init__, the registered after-component
+    callbacks, and finally after_component of scripts_current (if set).
+    Hooks receive only the keyword arguments; positional arguments go to the original __init__ alone.
+    Returns whatever the original __init__ returned.
+    """
+    if scripts_current is not None:
+        scripts_current.before_component(self, **kwargs)
+
+    script_callbacks.before_component_callback(self, **kwargs)
+
+    res = original_IOComponent_init(self, *args, **kwargs)
+
+    script_callbacks.after_component_callback(self, **kwargs)
+
+    # scripts_current is read again here rather than cached, so its value at this point decides
+    if scripts_current is not None:
+        scripts_current.after_component(self, **kwargs)
+
+    return res
+
+
+# keep a reference to gradio's constructor, then replace it with the wrapper defined above
+original_IOComponent_init = gr.components.IOComponent.__init__
+gr.components.IOComponent.__init__ = IOComponent_init
--- a/modules/scunet_model.py
+++ b/modules/scunet_model.py
@ -1,87 +0,0 @@
-import os.path
-import sys
-import traceback
-
-import PIL.Image
-import numpy as np
-import torch
-from basicsr.utils.download_util import load_file_from_url
-
-import modules.upscaler
-from modules import devices, modelloader
-from modules.scunet_model_arch import SCUNet as net
-
-
-class UpscalerScuNET(modules.upscaler.Upscaler):
-    def __init__(self, dirname):
-        self.name = "ScuNET"
-        self.model_name = "ScuNET GAN"
-        self.model_name2 = "ScuNET PSNR"
-        self.model_url = "https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_gan.pth"
-        self.model_url2 = "https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_psnr.pth"
-        self.user_path = dirname
-        super().__init__()
-        model_paths = self.find_models(ext_filter=[".pth"])
-        scalers = []
-        add_model2 = True
-        for file in model_paths:
-            if "http" in file:
-                name = self.model_name
-            else:
-                name = modelloader.friendly_name(file)
-            if name == self.model_name2 or file == self.model_url2:
-                add_model2 = False
-            try:
-                scaler_data = modules.upscaler.UpscalerData(name, file, self, 4)
-                scalers.append(scaler_data)
-            except Exception:
-                print(f"Error loading ScuNET model: {file}", file=sys.stderr)
-                print(traceback.format_exc(), file=sys.stderr)
-        if add_model2:
-            scaler_data2 = modules.upscaler.UpscalerData(self.model_name2, self.model_url2, self)
-            scalers.append(scaler_data2)
-        self.scalers = scalers
-
-    def do_upscale(self, img: PIL.Image, selected_file):
-        torch.cuda.empty_cache()
-
-        model = self.load_model(selected_file)
-        if model is None:
-            return img
-
-        device = devices.device_scunet
-        img = np.array(img)
-        img = img[:, :, ::-1]
-        img = np.moveaxis(img, 2, 0) / 255
-        img = torch.from_numpy(img).float()
-        img = devices.mps_contiguous_to(img.unsqueeze(0), device)
-
-        with torch.no_grad():
-            output = model(img)
-        output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
-        output = 255. * np.moveaxis(output, 0, 2)
-        output = output.astype(np.uint8)
-        output = output[:, :, ::-1]
-        torch.cuda.empty_cache()
-        return PIL.Image.fromarray(output, 'RGB')
-
-    def load_model(self, path: str):
-        device = devices.device_scunet
-        if "http" in path:
-            filename = load_file_from_url(url=self.model_url, model_dir=self.model_path, file_name="%s.pth" % self.name,
-                                          progress=True)
-        else:
-            filename = path
-        if not os.path.exists(os.path.join(self.model_path, filename)) or filename is None:
-            print(f"ScuNET: Unable to load model from {filename}", file=sys.stderr)
-            return None
-
-        model = net(in_nc=3, config=[4, 4, 4, 4, 4, 4, 4], dim=64)
-        model.load_state_dict(torch.load(filename), strict=True)
-        model.eval()
-        for k, v in model.named_parameters():
-            v.requires_grad = False
-        model = model.to(device)
-
-        return model
-
--- a/modules/scunet_model_arch.py
+++ b/modules/scunet_model_arch.py
@ -1,265 +0,0 @@
-# -*- coding: utf-8 -*-
-import numpy as np
-import torch
-import torch.nn as nn
-from einops import rearrange
-from einops.layers.torch import Rearrange
-from timm.models.layers import trunc_normal_, DropPath
-
-
-class WMSA(nn.Module):
-    """ Self-attention module in Swin Transformer
-    """
-
-    def __init__(self, input_dim, output_dim, head_dim, window_size, type):
-        super(WMSA, self).__init__()
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        self.head_dim = head_dim
-        self.scale = self.head_dim ** -0.5
-        self.n_heads = input_dim // head_dim
-        self.window_size = window_size
-        self.type = type
-        self.embedding_layer = nn.Linear(self.input_dim, 3 * self.input_dim, bias=True)
-
-        self.relative_position_params = nn.Parameter(
-            torch.zeros((2 * window_size - 1) * (2 * window_size - 1), self.n_heads))
-
-        self.linear = nn.Linear(self.input_dim, self.output_dim)
-
-        trunc_normal_(self.relative_position_params, std=.02)
-        self.relative_position_params = torch.nn.Parameter(
-            self.relative_position_params.view(2 * window_size - 1, 2 * window_size - 1, self.n_heads).transpose(1,
-                                                                                                                 2).transpose(
-                0, 1))
-
-    def generate_mask(self, h, w, p, shift):
-        """ generating the mask of SW-MSA
-        Args:
-            shift: shift parameters in CyclicShift.
-        Returns:
-            attn_mask: should be (1 1 w p p),
-        """
-        # supporting square.
-        attn_mask = torch.zeros(h, w, p, p, p, p, dtype=torch.bool, device=self.relative_position_params.device)
-        if self.type == 'W':
-            return attn_mask
-
-        s = p - shift
-        attn_mask[-1, :, :s, :, s:, :] = True
-        attn_mask[-1, :, s:, :, :s, :] = True
-        attn_mask[:, -1, :, :s, :, s:] = True
-        attn_mask[:, -1, :, s:, :, :s] = True
-        attn_mask = rearrange(attn_mask, 'w1 w2 p1 p2 p3 p4 -> 1 1 (w1 w2) (p1 p2) (p3 p4)')
-        return attn_mask
-
-    def forward(self, x):
-        """ Forward pass of Window Multi-head Self-attention module.
-        Args:
-            x: input tensor with shape of [b h w c];
-            attn_mask: attention mask, fill -inf where the value is True;
-        Returns:
-            output: tensor shape [b h w c]
-        """
-        if self.type != 'W': x = torch.roll(x, shifts=(-(self.window_size // 2), -(self.window_size // 2)), dims=(1, 2))
-        x = rearrange(x, 'b (w1 p1) (w2 p2) c -> b w1 w2 p1 p2 c', p1=self.window_size, p2=self.window_size)
-        h_windows = x.size(1)
-        w_windows = x.size(2)
-        # square validation
-        # assert h_windows == w_windows
-
-        x = rearrange(x, 'b w1 w2 p1 p2 c -> b (w1 w2) (p1 p2) c', p1=self.window_size, p2=self.window_size)
-        qkv = self.embedding_layer(x)
-        q, k, v = rearrange(qkv, 'b nw np (threeh c) -> threeh b nw np c', c=self.head_dim).chunk(3, dim=0)
-        sim = torch.einsum('hbwpc,hbwqc->hbwpq', q, k) * self.scale
-        # Adding learnable relative embedding
-        sim = sim + rearrange(self.relative_embedding(), 'h p q -> h 1 1 p q')
-        # Using Attn Mask to distinguish different subwindows.
-        if self.type != 'W':
-            attn_mask = self.generate_mask(h_windows, w_windows, self.window_size, shift=self.window_size // 2)
-            sim = sim.masked_fill_(attn_mask, float("-inf"))
-
-        probs = nn.functional.softmax(sim, dim=-1)
-        output = torch.einsum('hbwij,hbwjc->hbwic', probs, v)
-        output = rearrange(output, 'h b w p c -> b w p (h c)')
-        output = self.linear(output)
-        output = rearrange(output, 'b (w1 w2) (p1 p2) c -> b (w1 p1) (w2 p2) c', w1=h_windows, p1=self.window_size)
-
-        if self.type != 'W': output = torch.roll(output, shifts=(self.window_size // 2, self.window_size // 2),
-                                                 dims=(1, 2))
-        return output
-
-    def relative_embedding(self):
-        cord = torch.tensor(np.array([[i, j] for i in range(self.window_size) for j in range(self.window_size)]))
-        relation = cord[:, None, :] - cord[None, :, :] + self.window_size - 1
-        # negative is allowed
-        return self.relative_position_params[:, relation[:, :, 0].long(), relation[:, :, 1].long()]
-
-
-class Block(nn.Module):
-    def __init__(self, input_dim, output_dim, head_dim, window_size, drop_path, type='W', input_resolution=None):
-        """ SwinTransformer Block
-        """
-        super(Block, self).__init__()
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        assert type in ['W', 'SW']
-        self.type = type
-        if input_resolution <= window_size:
-            self.type = 'W'
-
-        self.ln1 = nn.LayerNorm(input_dim)
-        self.msa = WMSA(input_dim, input_dim, head_dim, window_size, self.type)
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.ln2 = nn.LayerNorm(input_dim)
-        self.mlp = nn.Sequential(
-            nn.Linear(input_dim, 4 * input_dim),
-            nn.GELU(),
-            nn.Linear(4 * input_dim, output_dim),
-        )
-
-    def forward(self, x):
-        x = x + self.drop_path(self.msa(self.ln1(x)))
-        x = x + self.drop_path(self.mlp(self.ln2(x)))
-        return x
-
-
-class ConvTransBlock(nn.Module):
-    def __init__(self, conv_dim, trans_dim, head_dim, window_size, drop_path, type='W', input_resolution=None):
-        """ SwinTransformer and Conv Block
-        """
-        super(ConvTransBlock, self).__init__()
-        self.conv_dim = conv_dim
-        self.trans_dim = trans_dim
-        self.head_dim = head_dim
-        self.window_size = window_size
-        self.drop_path = drop_path
-        self.type = type
-        self.input_resolution = input_resolution
-
-        assert self.type in ['W', 'SW']
-        if self.input_resolution <= self.window_size:
-            self.type = 'W'
-
-        self.trans_block = Block(self.trans_dim, self.trans_dim, self.head_dim, self.window_size, self.drop_path,
-                                 self.type, self.input_resolution)
-        self.conv1_1 = nn.Conv2d(self.conv_dim + self.trans_dim, self.conv_dim + self.trans_dim, 1, 1, 0, bias=True)
-        self.conv1_2 = nn.Conv2d(self.conv_dim + self.trans_dim, self.conv_dim + self.trans_dim, 1, 1, 0, bias=True)
-
-        self.conv_block = nn.Sequential(
-            nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False),
-            nn.ReLU(True),
-            nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False)
-        )
-
-    def forward(self, x):
-        conv_x, trans_x = torch.split(self.conv1_1(x), (self.conv_dim, self.trans_dim), dim=1)
-        conv_x = self.conv_block(conv_x) + conv_x
-        trans_x = Rearrange('b c h w -> b h w c')(trans_x)
-        trans_x = self.trans_block(trans_x)
-        trans_x = Rearrange('b h w c -> b c h w')(trans_x)
-        res = self.conv1_2(torch.cat((conv_x, trans_x), dim=1))
-        x = x + res
-
-        return x
-
-
-class SCUNet(nn.Module):
-    # def __init__(self, in_nc=3, config=[2, 2, 2, 2, 2, 2, 2], dim=64, drop_path_rate=0.0, input_resolution=256):
-    def __init__(self, in_nc=3, config=None, dim=64, drop_path_rate=0.0, input_resolution=256):
-        super(SCUNet, self).__init__()
-        if config is None:
-            config = [2, 2, 2, 2, 2, 2, 2]
-        self.config = config
-        self.dim = dim
-        self.head_dim = 32
-        self.window_size = 8
-
-        # drop path rate for each layer
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(config))]
-
-        self.m_head = [nn.Conv2d(in_nc, dim, 3, 1, 1, bias=False)]
-
-        begin = 0
-        self.m_down1 = [ConvTransBlock(dim // 2, dim // 2, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution)
-                        for i in range(config[0])] + \
-                       [nn.Conv2d(dim, 2 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[0]
-        self.m_down2 = [ConvTransBlock(dim, dim, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution // 2)
-                        for i in range(config[1])] + \
-                       [nn.Conv2d(2 * dim, 4 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[1]
-        self.m_down3 = [ConvTransBlock(2 * dim, 2 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution // 4)
-                        for i in range(config[2])] + \
-                       [nn.Conv2d(4 * dim, 8 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[2]
-        self.m_body = [ConvTransBlock(4 * dim, 4 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                      'W' if not i % 2 else 'SW', input_resolution // 8)
-                       for i in range(config[3])]
-
-        begin += config[3]
-        self.m_up3 = [nn.ConvTranspose2d(8 * dim, 4 * dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(2 * dim, 2 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution // 4)
-                      for i in range(config[4])]
-
-        begin += config[4]
-        self.m_up2 = [nn.ConvTranspose2d(4 * dim, 2 * dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(dim, dim, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution // 2)
-                      for i in range(config[5])]
-
-        begin += config[5]
-        self.m_up1 = [nn.ConvTranspose2d(2 * dim, dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(dim // 2, dim // 2, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution)
-                      for i in range(config[6])]
-
-        self.m_tail = [nn.Conv2d(dim, in_nc, 3, 1, 1, bias=False)]
-
-        self.m_head = nn.Sequential(*self.m_head)
-        self.m_down1 = nn.Sequential(*self.m_down1)
-        self.m_down2 = nn.Sequential(*self.m_down2)
-        self.m_down3 = nn.Sequential(*self.m_down3)
-        self.m_body = nn.Sequential(*self.m_body)
-        self.m_up3 = nn.Sequential(*self.m_up3)
-        self.m_up2 = nn.Sequential(*self.m_up2)
-        self.m_up1 = nn.Sequential(*self.m_up1)
-        self.m_tail = nn.Sequential(*self.m_tail)
-        # self.apply(self._init_weights)
-
-    def forward(self, x0):
-
-        h, w = x0.size()[-2:]
-        paddingBottom = int(np.ceil(h / 64) * 64 - h)
-        paddingRight = int(np.ceil(w / 64) * 64 - w)
-        x0 = nn.ReplicationPad2d((0, paddingRight, 0, paddingBottom))(x0)
-
-        x1 = self.m_head(x0)
-        x2 = self.m_down1(x1)
-        x3 = self.m_down2(x2)
-        x4 = self.m_down3(x3)
-        x = self.m_body(x4)
-        x = self.m_up3(x + x4)
-        x = self.m_up2(x + x3)
-        x = self.m_up1(x + x2)
-        x = self.m_tail(x + x1)
-
-        x = x[..., :h, :w]
-
-        return x
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
--- a/modules/sd_hijack.py
+++ b/modules/sd_hijack.py
@ -1,61 +1,81 @@
-import math
-import os
-import sys
-import traceback
 import torch
-import numpy as np
-from torch import einsum
 from torch.nn.functional import silu

 import modules.textual_inversion.textual_inversion
-from modules import prompt_parser, devices, sd_hijack_optimizations, shared
-from modules.shared import opts, device, cmd_opts
+from modules import devices, sd_hijack_optimizations, shared, sd_hijack_checkpoint
+from modules.hypernetworks import hypernetwork
+from modules.shared import cmd_opts
+from modules import sd_hijack_clip, sd_hijack_open_clip, sd_hijack_unet, sd_hijack_xlmr, xlmr
+
 from modules.sd_hijack_optimizations import invokeAI_mps_available

 import ldm.modules.attention
 import ldm.modules.diffusionmodules.model
+import ldm.modules.diffusionmodules.openaimodel
+import ldm.models.diffusion.ddim
+import ldm.models.diffusion.plms
+import ldm.modules.encoders.modules

 attention_CrossAttention_forward = ldm.modules.attention.CrossAttention.forward
 diffusionmodules_model_nonlinearity = ldm.modules.diffusionmodules.model.nonlinearity
 diffusionmodules_model_AttnBlock_forward = ldm.modules.diffusionmodules.model.AttnBlock.forward

+# new memory efficient cross attention blocks do not support hypernets and we already
+# have memory efficient cross attention anyway, so this disables SD2.0's memory efficient cross attention
+ldm.modules.attention.MemoryEfficientCrossAttention = ldm.modules.attention.CrossAttention
+ldm.modules.attention.BasicTransformerBlock.ATTENTION_MODES["softmax-xformers"] = ldm.modules.attention.CrossAttention
+
+# silence new console spam from SD2
+ldm.modules.attention.print = lambda *args: None
+ldm.modules.diffusionmodules.model.print = lambda *args: None
+

 def apply_optimizations():
    undo_optimizations()

    ldm.modules.diffusionmodules.model.nonlinearity = silu
+    ldm.modules.diffusionmodules.openaimodel.th = sd_hijack_unet.th
+    
+    optimization_method = None

    if cmd_opts.force_enable_xformers or (cmd_opts.xformers and shared.xformers_available and torch.version.cuda and (6, 0) <= torch.cuda.get_device_capability(shared.device) <= (9, 0)):
        print("Applying xformers cross attention optimization.")
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.xformers_attention_forward
        ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward
+        optimization_method = 'xformers'
    elif cmd_opts.opt_split_attention_v1:
        print("Applying v1 cross attention optimization.")
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward_v1
+        optimization_method = 'V1'
    elif not cmd_opts.disable_opt_split_attention and (cmd_opts.opt_split_attention_invokeai or not torch.cuda.is_available()):
        if not invokeAI_mps_available and shared.device.type == 'mps':
            print("The InvokeAI cross attention optimization for MPS requires the psutil package which is not installed.")
            print("Applying v1 cross attention optimization.")
            ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward_v1
+            optimization_method = 'V1'
        else:
            print("Applying cross attention optimization (InvokeAI).")
            ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward_invokeAI
+            optimization_method = 'InvokeAI'
    elif not cmd_opts.disable_opt_split_attention and (cmd_opts.opt_split_attention or torch.cuda.is_available()):
        print("Applying cross attention optimization (Doggettx).")
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward
        ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.cross_attention_attnblock_forward
+        optimization_method = 'Doggettx'
+
+    return optimization_method


 def undo_optimizations():
-    from modules.hypernetworks import hypernetwork
-
    ldm.modules.attention.CrossAttention.forward = hypernetwork.attention_CrossAttention_forward
    ldm.modules.diffusionmodules.model.nonlinearity = diffusionmodules_model_nonlinearity
    ldm.modules.diffusionmodules.model.AttnBlock.forward = diffusionmodules_model_AttnBlock_forward


-def get_target_prompt_token_count(token_count):
-    return math.ceil(max(token_count, 1) / 75) * 75
+def fix_checkpoint():
+    """Replace the forward methods of ldm's BasicTransformerBlock, ResBlock and AttentionBlock
+    with the corresponding implementations from sd_hijack_checkpoint."""
+    ldm.modules.attention.BasicTransformerBlock.forward = sd_hijack_checkpoint.BasicTransformerBlock_forward
+    ldm.modules.diffusionmodules.openaimodel.ResBlock.forward = sd_hijack_checkpoint.ResBlock_forward
+    ldm.modules.diffusionmodules.openaimodel.AttentionBlock.forward = sd_hijack_checkpoint.AttentionBlock_forward


 class StableDiffusionModelHijack:
@ -64,18 +84,31 @@ class StableDiffusionModelHijack:
    layers = None
    circular_enabled = False
    clip = None
+    optimization_method = None

    embedding_db = modules.textual_inversion.textual_inversion.EmbeddingDatabase(cmd_opts.embeddings_dir)

    def hijack(self, m):
-        model_embeddings = m.cond_stage_model.transformer.text_model.embeddings

-        model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self)
-        m.cond_stage_model = FrozenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
+        if type(m.cond_stage_model) == xlmr.BertSeriesModelWithTransformation:
+            model_embeddings = m.cond_stage_model.roberta.embeddings
+            model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.word_embeddings, self)
+            m.cond_stage_model = sd_hijack_xlmr.FrozenXLMREmbedderWithCustomWords(m.cond_stage_model, self)
+
+        elif type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenCLIPEmbedder:
+            model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
+            model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self)
+            m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
+
+        elif type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder:
+            m.cond_stage_model.model.token_embedding = EmbeddingsWithFixes(m.cond_stage_model.model.token_embedding, self)
+            m.cond_stage_model = sd_hijack_open_clip.FrozenOpenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
+
+        self.optimization_method = apply_optimizations()

        self.clip = m.cond_stage_model
-
-        apply_optimizations()
+        
+        fix_checkpoint()

        def flatten(el):
            flattened = [flatten(children) for children in el.children()]
@ -87,15 +120,22 @@ class StableDiffusionModelHijack:
        self.layers = flatten(m)

    def undo_hijack(self, m):
-        if type(m.cond_stage_model) == FrozenCLIPEmbedderWithCustomWords:
+        """Revert what hijack() did to model ``m``.
+
+        Unwraps the conditioning model (and, for the CLIP / OpenCLIP cases,
+        the token embedding layer that hijack() wrapped in EmbeddingsWithFixes),
+        turns circular mode off and clears the cached layer list and clip
+        reference.
+        """
+
+        # hijack() installs sd_hijack_xlmr.FrozenXLMREmbedderWithCustomWords for
+        # XLMR models, so the wrapper type is what has to be matched here; the
+        # bare BertSeriesModelWithTransformation is the un-hijacked model.
+        # NOTE(review): assumes the XLMR wrapper exposes `.wrapped` like the
+        # CLIP wrappers do - confirm against sd_hijack_xlmr.
+        if type(m.cond_stage_model) == sd_hijack_xlmr.FrozenXLMREmbedderWithCustomWords:
+            m.cond_stage_model = m.cond_stage_model.wrapped
+
+        elif type(m.cond_stage_model) == sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords:
            m.cond_stage_model = m.cond_stage_model.wrapped

-        model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
-        if type(model_embeddings.token_embedding) == EmbeddingsWithFixes:
-            model_embeddings.token_embedding = model_embeddings.token_embedding.wrapped
+            model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
+            if type(model_embeddings.token_embedding) == EmbeddingsWithFixes:
+                model_embeddings.token_embedding = model_embeddings.token_embedding.wrapped
+        elif type(m.cond_stage_model) == sd_hijack_open_clip.FrozenOpenCLIPEmbedderWithCustomWords:
+            m.cond_stage_model.wrapped.model.token_embedding = m.cond_stage_model.wrapped.model.token_embedding.wrapped
+            m.cond_stage_model = m.cond_stage_model.wrapped

+        self.apply_circular(False)
        self.layers = None
-        self.circular_enabled = False
        self.clip = None

    def apply_circular(self, enable):
@ -112,261 +152,8 @@ class StableDiffusionModelHijack:

    def tokenize(self, text):
        _, remade_batch_tokens, _, _, _, token_count = self.clip.process_text([text])
-        return remade_batch_tokens[0], token_count, get_target_prompt_token_count(token_count)

-
-class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
-    def __init__(self, wrapped, hijack):
-        super().__init__()
-        self.wrapped = wrapped
-        self.hijack: StableDiffusionModelHijack = hijack
-        self.tokenizer = wrapped.tokenizer
-        self.token_mults = {}
-
-        self.comma_token = [v for k, v in self.tokenizer.get_vocab().items() if k == ',</w>'][0]
-
-        tokens_with_parens = [(k, v) for k, v in self.tokenizer.get_vocab().items() if '(' in k or ')' in k or '[' in k or ']' in k]
-        for text, ident in tokens_with_parens:
-            mult = 1.0
-            for c in text:
-                if c == '[':
-                    mult /= 1.1
-                if c == ']':
-                    mult *= 1.1
-                if c == '(':
-                    mult *= 1.1
-                if c == ')':
-                    mult /= 1.1
-
-            if mult != 1.0:
-                self.token_mults[ident] = mult
-
-    def tokenize_line(self, line, used_custom_terms, hijack_comments):
-        id_end = self.wrapped.tokenizer.eos_token_id
-
-        if opts.enable_emphasis:
-            parsed = prompt_parser.parse_prompt_attention(line)
-        else:
-            parsed = [[line, 1.0]]
-
-        tokenized = self.wrapped.tokenizer([text for text, _ in parsed], truncation=False, add_special_tokens=False)["input_ids"]
-
-        fixes = []
-        remade_tokens = []
-        multipliers = []
-        last_comma = -1
-
-        for tokens, (text, weight) in zip(tokenized, parsed):
-            i = 0
-            while i < len(tokens):
-                token = tokens[i]
-
-                embedding, embedding_length_in_tokens = self.hijack.embedding_db.find_embedding_at_position(tokens, i)
-
-                if token == self.comma_token:
-                    last_comma = len(remade_tokens)
-                elif opts.comma_padding_backtrack != 0 and max(len(remade_tokens), 1) % 75 == 0 and last_comma != -1 and len(remade_tokens) - last_comma <= opts.comma_padding_backtrack:
-                    last_comma += 1
-                    reloc_tokens = remade_tokens[last_comma:]
-                    reloc_mults = multipliers[last_comma:]
-
-                    remade_tokens = remade_tokens[:last_comma]
-                    length = len(remade_tokens)
-
-                    rem = int(math.ceil(length / 75)) * 75 - length
-                    remade_tokens += [id_end] * rem + reloc_tokens
-                    multipliers = multipliers[:last_comma] + [1.0] * rem + reloc_mults
-
-                if embedding is None:
-                    remade_tokens.append(token)
-                    multipliers.append(weight)
-                    i += 1
-                else:
-                    emb_len = int(embedding.vec.shape[0])
-                    iteration = len(remade_tokens) // 75
-                    if (len(remade_tokens) + emb_len) // 75 != iteration:
-                        rem = (75 * (iteration + 1) - len(remade_tokens))
-                        remade_tokens += [id_end] * rem
-                        multipliers += [1.0] * rem
-                        iteration += 1
-                    fixes.append((iteration, (len(remade_tokens) % 75, embedding)))
-                    remade_tokens += [0] * emb_len
-                    multipliers += [weight] * emb_len
-                    used_custom_terms.append((embedding.name, embedding.checksum()))
-                    i += embedding_length_in_tokens
-
-        token_count = len(remade_tokens)
-        prompt_target_length = get_target_prompt_token_count(token_count)
-        tokens_to_add = prompt_target_length - len(remade_tokens)
-
-        remade_tokens = remade_tokens + [id_end] * tokens_to_add
-        multipliers = multipliers + [1.0] * tokens_to_add
-
-        return remade_tokens, fixes, multipliers, token_count
-
-    def process_text(self, texts):
-        used_custom_terms = []
-        remade_batch_tokens = []
-        hijack_comments = []
-        hijack_fixes = []
-        token_count = 0
-
-        cache = {}
-        batch_multipliers = []
-        for line in texts:
-            if line in cache:
-                remade_tokens, fixes, multipliers = cache[line]
-            else:
-                remade_tokens, fixes, multipliers, current_token_count = self.tokenize_line(line, used_custom_terms, hijack_comments)
-                token_count = max(current_token_count, token_count)
-
-                cache[line] = (remade_tokens, fixes, multipliers)
-
-            remade_batch_tokens.append(remade_tokens)
-            hijack_fixes.append(fixes)
-            batch_multipliers.append(multipliers)
-
-        return batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count
-
-    def process_text_old(self, text):
-        id_start = self.wrapped.tokenizer.bos_token_id
-        id_end = self.wrapped.tokenizer.eos_token_id
-        maxlen = self.wrapped.max_length  # you get to stay at 77
-        used_custom_terms = []
-        remade_batch_tokens = []
-        overflowing_words = []
-        hijack_comments = []
-        hijack_fixes = []
-        token_count = 0
-
-        cache = {}
-        batch_tokens = self.wrapped.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
-        batch_multipliers = []
-        for tokens in batch_tokens:
-            tuple_tokens = tuple(tokens)
-
-            if tuple_tokens in cache:
-                remade_tokens, fixes, multipliers = cache[tuple_tokens]
-            else:
-                fixes = []
-                remade_tokens = []
-                multipliers = []
-                mult = 1.0
-
-                i = 0
-                while i < len(tokens):
-                    token = tokens[i]
-
-                    embedding, embedding_length_in_tokens = self.hijack.embedding_db.find_embedding_at_position(tokens, i)
-
-                    mult_change = self.token_mults.get(token) if opts.enable_emphasis else None
-                    if mult_change is not None:
-                        mult *= mult_change
-                        i += 1
-                    elif embedding is None:
-                        remade_tokens.append(token)
-                        multipliers.append(mult)
-                        i += 1
-                    else:
-                        emb_len = int(embedding.vec.shape[0])
-                        fixes.append((len(remade_tokens), embedding))
-                        remade_tokens += [0] * emb_len
-                        multipliers += [mult] * emb_len
-                        used_custom_terms.append((embedding.name, embedding.checksum()))
-                        i += embedding_length_in_tokens
-
-                if len(remade_tokens) > maxlen - 2:
-                    vocab = {v: k for k, v in self.wrapped.tokenizer.get_vocab().items()}
-                    ovf = remade_tokens[maxlen - 2:]
-                    overflowing_words = [vocab.get(int(x), "") for x in ovf]
-                    overflowing_text = self.wrapped.tokenizer.convert_tokens_to_string(''.join(overflowing_words))
-                    hijack_comments.append(f"Warning: too many input tokens; some ({len(overflowing_words)}) have been truncated:\n{overflowing_text}\n")
-
-                token_count = len(remade_tokens)
-                remade_tokens = remade_tokens + [id_end] * (maxlen - 2 - len(remade_tokens))
-                remade_tokens = [id_start] + remade_tokens[0:maxlen - 2] + [id_end]
-                cache[tuple_tokens] = (remade_tokens, fixes, multipliers)
-
-            multipliers = multipliers + [1.0] * (maxlen - 2 - len(multipliers))
-            multipliers = [1.0] + multipliers[0:maxlen - 2] + [1.0]
-
-            remade_batch_tokens.append(remade_tokens)
-            hijack_fixes.append(fixes)
-            batch_multipliers.append(multipliers)
-        return batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count
-
-    def forward(self, text):
-        use_old = opts.use_old_emphasis_implementation
-        if use_old:
-            batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count = self.process_text_old(text)
-        else:
-            batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count = self.process_text(text)
-
-        self.hijack.comments += hijack_comments
-
-        if len(used_custom_terms) > 0:
-            self.hijack.comments.append("Used embeddings: " + ", ".join([f'{word} [{checksum}]' for word, checksum in used_custom_terms]))
-
-        if use_old:
-            self.hijack.fixes = hijack_fixes
-            return self.process_tokens(remade_batch_tokens, batch_multipliers)
-
-        z = None
-        i = 0
-        while max(map(len, remade_batch_tokens)) != 0:
-            rem_tokens = [x[75:] for x in remade_batch_tokens]
-            rem_multipliers = [x[75:] for x in batch_multipliers]
-
-            self.hijack.fixes = []
-            for unfiltered in hijack_fixes:
-                fixes = []
-                for fix in unfiltered:
-                    if fix[0] == i:
-                        fixes.append(fix[1])
-                self.hijack.fixes.append(fixes)
-
-            tokens = []
-            multipliers = []
-            for j in range(len(remade_batch_tokens)):
-                if len(remade_batch_tokens[j]) > 0:
-                    tokens.append(remade_batch_tokens[j][:75])
-                    multipliers.append(batch_multipliers[j][:75])
-                else:
-                    tokens.append([self.wrapped.tokenizer.eos_token_id] * 75)
-                    multipliers.append([1.0] * 75)
-
-            z1 = self.process_tokens(tokens, multipliers)
-            z = z1 if z is None else torch.cat((z, z1), axis=-2)
-
-            remade_batch_tokens = rem_tokens
-            batch_multipliers = rem_multipliers
-            i += 1
-
-        return z
-
-    def process_tokens(self, remade_batch_tokens, batch_multipliers):
-        if not opts.use_old_emphasis_implementation:
-            remade_batch_tokens = [[self.wrapped.tokenizer.bos_token_id] + x[:75] + [self.wrapped.tokenizer.eos_token_id] for x in remade_batch_tokens]
-            batch_multipliers = [[1.0] + x[:75] + [1.0] for x in batch_multipliers]
-
-        tokens = torch.asarray(remade_batch_tokens).to(device)
-        outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=-opts.CLIP_stop_at_last_layers)
-
-        if opts.CLIP_stop_at_last_layers > 1:
-            z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
-            z = self.wrapped.transformer.text_model.final_layer_norm(z)
-        else:
-            z = outputs.last_hidden_state
-
-        # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
-        batch_multipliers_of_same_length = [x + [1.0] * (75 - len(x)) for x in batch_multipliers]
-        batch_multipliers = torch.asarray(batch_multipliers_of_same_length).to(device)
-        original_mean = z.mean()
-        z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape)
-        new_mean = z.mean()
-        z *= original_mean / new_mean
-
-        return z
+        return remade_batch_tokens[0], token_count, sd_hijack_clip.get_target_prompt_token_count(token_count)


 class EmbeddingsWithFixes(torch.nn.Module):
@ -406,3 +193,19 @@ def add_circular_option_to_conv_2d():


 model_hijack = StableDiffusionModelHijack()
+
+
+def register_buffer(self, name, attr):
+    """Store ``attr`` on the sampler under ``name``, moving tensors to devices.device.
+
+    Only exact torch.Tensor instances that live on another device are moved;
+    when the target device is 'mps' they are also converted to float32.
+    Anything else is stored unchanged.
+    """
+
+    on_wrong_device = type(attr) == torch.Tensor and attr.device != devices.device
+    if on_wrong_device:
+        target_dtype = torch.float32 if devices.device.type == 'mps' else None
+        attr = attr.to(device=devices.device, dtype=target_dtype)
+
+    setattr(self, name, attr)
+
+
+ldm.models.diffusion.ddim.DDIMSampler.register_buffer = register_buffer
+ldm.models.diffusion.plms.PLMSSampler.register_buffer = register_buffer
--- a/modules/sd_hijack_checkpoint.py
+++ b/modules/sd_hijack_checkpoint.py
@ -0,0 +1,10 @@
+from torch.utils.checkpoint import checkpoint
+
+def BasicTransformerBlock_forward(self, x, context=None):
+    """Run the block's ``_forward`` through torch.utils.checkpoint.checkpoint."""
+    inner_forward = self._forward
+    return checkpoint(inner_forward, x, context)
+
+def AttentionBlock_forward(self, x):
+    """Run the block's ``_forward`` through torch.utils.checkpoint.checkpoint."""
+    inner_forward = self._forward
+    return checkpoint(inner_forward, x)
+
+def ResBlock_forward(self, x, emb):
+    """Run the block's ``_forward`` through torch.utils.checkpoint.checkpoint."""
+    inner_forward = self._forward
+    return checkpoint(inner_forward, x, emb)
--- a/modules/sd_hijack_clip.py
+++ b/modules/sd_hijack_clip.py
@ -0,0 +1,303 @@
+import math
+
+import torch
+
+from modules import prompt_parser, devices
+from modules.shared import opts
+
+def get_target_prompt_token_count(token_count):
+    """Round ``token_count`` up to a multiple of 75, with a minimum of 75."""
+    chunk_length = 75
+    at_least_one = max(token_count, 1)
+    chunk_total = math.ceil(at_least_one / chunk_length)
+    return chunk_total * chunk_length
+
+
+class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
+    """Shared prompt-processing logic for a wrapped text encoder.
+
+    Splits prompts into 75-token chunks, tracks a per-token emphasis weight and
+    records, in self.hijack.fixes, the positions at which embeddings found by
+    self.hijack.embedding_db belong.
+
+    Subclasses implement tokenize(), encode_with_transformers() and
+    encode_embedding_init_text(). The methods below also read self.comma_token,
+    self.id_start, self.id_end, self.id_pad and self.token_mults; none of them
+    is assigned in this class, so a subclass has to set them.
+    """
+
+    def __init__(self, wrapped, hijack):
+        super().__init__()
+        self.wrapped = wrapped
+        self.hijack = hijack
+
+    def tokenize(self, texts):
+        """Return one list of token ids per entry of ``texts`` (subclass hook)."""
+        raise NotImplementedError
+
+    def encode_with_transformers(self, tokens):
+        """Encode a tensor of token ids into the conditioning tensor (subclass hook)."""
+        raise NotImplementedError
+
+    def encode_embedding_init_text(self, init_text, nvpt):
+        """Subclass hook; not called anywhere in this class."""
+        raise NotImplementedError
+
+    def tokenize_line(self, line, used_custom_terms, hijack_comments):
+        """Tokenize one prompt line into a padded, chunk-aligned token list.
+
+        Appends (embedding.name, embedding.checksum()) to ``used_custom_terms``
+        for every embedding found; ``hijack_comments`` is accepted but not used
+        by this method.
+
+        Returns (remade_tokens, fixes, multipliers, token_count): the token ids
+        padded with self.id_end to a multiple of 75, a list of
+        (chunk_index, (offset_in_chunk, embedding)) entries, one weight per
+        token, and the number of tokens before the final padding.
+        """
+        if opts.enable_emphasis:
+            parsed = prompt_parser.parse_prompt_attention(line)
+        else:
+            # Emphasis disabled: the whole line is one fragment with weight 1.0.
+            parsed = [[line, 1.0]]
+
+        tokenized = self.tokenize([text for text, _ in parsed])
+
+        fixes = []
+        remade_tokens = []
+        multipliers = []
+        last_comma = -1
+
+        for tokens, (text, weight) in zip(tokenized, parsed):
+            i = 0
+            while i < len(tokens):
+                token = tokens[i]
+
+                embedding, embedding_length_in_tokens = self.hijack.embedding_db.find_embedding_at_position(tokens, i)
+
+                if token == self.comma_token:
+                    last_comma = len(remade_tokens)
+                elif opts.comma_padding_backtrack != 0 and max(len(remade_tokens), 1) % 75 == 0 and last_comma != -1 and len(remade_tokens) - last_comma <= opts.comma_padding_backtrack:
+                    # A chunk boundary was reached shortly after a comma: move
+                    # everything after that comma into the next chunk and fill
+                    # the gap with id_end, so the split falls at the comma.
+                    last_comma += 1
+                    reloc_tokens = remade_tokens[last_comma:]
+                    reloc_mults = multipliers[last_comma:]
+
+                    remade_tokens = remade_tokens[:last_comma]
+                    length = len(remade_tokens)
+
+                    rem = int(math.ceil(length / 75)) * 75 - length
+                    remade_tokens += [self.id_end] * rem + reloc_tokens
+                    multipliers = multipliers[:last_comma] + [1.0] * rem + reloc_mults
+
+                if embedding is None:
+                    remade_tokens.append(token)
+                    multipliers.append(weight)
+                    i += 1
+                else:
+                    emb_len = int(embedding.vec.shape[0])
+                    iteration = len(remade_tokens) // 75
+                    if (len(remade_tokens) + emb_len) // 75 != iteration:
+                        # The embedding would straddle a chunk boundary: pad the
+                        # current chunk with id_end and start it in the next one.
+                        rem = (75 * (iteration + 1) - len(remade_tokens))
+                        remade_tokens += [self.id_end] * rem
+                        multipliers += [1.0] * rem
+                        iteration += 1
+                    fixes.append((iteration, (len(remade_tokens) % 75, embedding)))
+                    # Token id 0 is a placeholder for each embedding vector slot.
+                    remade_tokens += [0] * emb_len
+                    multipliers += [weight] * emb_len
+                    used_custom_terms.append((embedding.name, embedding.checksum()))
+                    i += embedding_length_in_tokens
+
+        token_count = len(remade_tokens)
+        prompt_target_length = get_target_prompt_token_count(token_count)
+        tokens_to_add = prompt_target_length - len(remade_tokens)
+
+        remade_tokens = remade_tokens + [self.id_end] * tokens_to_add
+        multipliers = multipliers + [1.0] * tokens_to_add
+
+        return remade_tokens, fixes, multipliers, token_count
+
+    def process_text(self, texts):
+        """Run tokenize_line() over every entry of ``texts``.
+
+        Identical lines within one call are tokenized once and reused. The
+        returned token_count is the largest count among the lines that were
+        actually tokenized.
+
+        Returns (batch_multipliers, remade_batch_tokens, used_custom_terms,
+        hijack_comments, hijack_fixes, token_count).
+        """
+        used_custom_terms = []
+        remade_batch_tokens = []
+        hijack_comments = []
+        hijack_fixes = []
+        token_count = 0
+
+        cache = {}
+        batch_multipliers = []
+        for line in texts:
+            if line in cache:
+                remade_tokens, fixes, multipliers = cache[line]
+            else:
+                remade_tokens, fixes, multipliers, current_token_count = self.tokenize_line(line, used_custom_terms, hijack_comments)
+                token_count = max(current_token_count, token_count)
+
+                cache[line] = (remade_tokens, fixes, multipliers)
+
+            remade_batch_tokens.append(remade_tokens)
+            hijack_fixes.append(fixes)
+            batch_multipliers.append(multipliers)
+
+        return batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count
+
+    def process_text_old(self, texts):
+        """Older single-chunk variant of process_text().
+
+        Each prompt is cut to self.wrapped.max_length - 2 tokens and framed
+        with id_start / id_end; truncation is reported through the returned
+        hijack_comments. Emphasis comes from self.token_mults when
+        opts.enable_emphasis is set. Fixes are (position, embedding) pairs, and
+        the returned token_count is that of the last prompt that was not
+        served from the cache.
+
+        Returns the same 6-tuple layout as process_text().
+        """
+        id_start = self.id_start
+        id_end = self.id_end
+        maxlen = self.wrapped.max_length  # you get to stay at 77
+        used_custom_terms = []
+        remade_batch_tokens = []
+        hijack_comments = []
+        hijack_fixes = []
+        token_count = 0
+
+        cache = {}
+        batch_tokens = self.tokenize(texts)
+        batch_multipliers = []
+        for tokens in batch_tokens:
+            tuple_tokens = tuple(tokens)
+
+            if tuple_tokens in cache:
+                remade_tokens, fixes, multipliers = cache[tuple_tokens]
+            else:
+                fixes = []
+                remade_tokens = []
+                multipliers = []
+                mult = 1.0
+
+                i = 0
+                while i < len(tokens):
+                    token = tokens[i]
+
+                    embedding, embedding_length_in_tokens = self.hijack.embedding_db.find_embedding_at_position(tokens, i)
+
+                    mult_change = self.token_mults.get(token) if opts.enable_emphasis else None
+                    if mult_change is not None:
+                        # Tokens listed in token_mults only adjust the running
+                        # weight; they are dropped from the output.
+                        mult *= mult_change
+                        i += 1
+                    elif embedding is None:
+                        remade_tokens.append(token)
+                        multipliers.append(mult)
+                        i += 1
+                    else:
+                        emb_len = int(embedding.vec.shape[0])
+                        fixes.append((len(remade_tokens), embedding))
+                        remade_tokens += [0] * emb_len
+                        multipliers += [mult] * emb_len
+                        used_custom_terms.append((embedding.name, embedding.checksum()))
+                        i += embedding_length_in_tokens
+
+                if len(remade_tokens) > maxlen - 2:
+                    # Build a human-readable list of the tokens that get cut off.
+                    vocab = {v: k for k, v in self.wrapped.tokenizer.get_vocab().items()}
+                    ovf = remade_tokens[maxlen - 2:]
+                    overflowing_words = [vocab.get(int(x), "") for x in ovf]
+                    overflowing_text = self.wrapped.tokenizer.convert_tokens_to_string(''.join(overflowing_words))
+                    hijack_comments.append(f"Warning: too many input tokens; some ({len(overflowing_words)}) have been truncated:\n{overflowing_text}\n")
+
+                token_count = len(remade_tokens)
+                remade_tokens = remade_tokens + [id_end] * (maxlen - 2 - len(remade_tokens))
+                remade_tokens = [id_start] + remade_tokens[0:maxlen - 2] + [id_end]
+                # The cached multipliers are the unpadded ones; padding is
+                # applied below for cache hits and misses alike.
+                cache[tuple_tokens] = (remade_tokens, fixes, multipliers)
+
+            multipliers = multipliers + [1.0] * (maxlen - 2 - len(multipliers))
+            multipliers = [1.0] + multipliers[0:maxlen - 2] + [1.0]
+
+            remade_batch_tokens.append(remade_tokens)
+            hijack_fixes.append(fixes)
+            batch_multipliers.append(multipliers)
+        return batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count
+
+    def forward(self, text):
+        """Encode a batch of prompts into one conditioning tensor.
+
+        ``text`` is handed to process_text() / process_text_old(), which iterate
+        over it - presumably a list of prompt strings; confirm against callers.
+        Comments and the "Used embeddings" note are appended to
+        self.hijack.comments. With the new implementation the prompts are
+        encoded 75 tokens at a time and the pieces are concatenated along the
+        second-to-last axis.
+        """
+        use_old = opts.use_old_emphasis_implementation
+        if use_old:
+            batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count = self.process_text_old(text)
+        else:
+            batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count = self.process_text(text)
+
+        self.hijack.comments += hijack_comments
+
+        if len(used_custom_terms) > 0:
+            self.hijack.comments.append("Used embeddings: " + ", ".join([f'{word} [{checksum}]' for word, checksum in used_custom_terms]))
+
+        if use_old:
+            self.hijack.fixes = hijack_fixes
+            return self.process_tokens(remade_batch_tokens, batch_multipliers)
+
+        z = None
+        i = 0
+        # Consume 75 tokens per prompt per pass until every prompt is exhausted.
+        while max(map(len, remade_batch_tokens)) != 0:
+            rem_tokens = [x[75:] for x in remade_batch_tokens]
+            rem_multipliers = [x[75:] for x in batch_multipliers]
+
+            # Expose only the fixes that belong to chunk ``i``.
+            self.hijack.fixes = []
+            for unfiltered in hijack_fixes:
+                fixes = []
+                for fix in unfiltered:
+                    if fix[0] == i:
+                        fixes.append(fix[1])
+                self.hijack.fixes.append(fixes)
+
+            tokens = []
+            multipliers = []
+            for j in range(len(remade_batch_tokens)):
+                if len(remade_batch_tokens[j]) > 0:
+                    tokens.append(remade_batch_tokens[j][:75])
+                    multipliers.append(batch_multipliers[j][:75])
+                else:
+                    # This prompt is shorter than the longest one in the batch:
+                    # feed a chunk made of id_end tokens with neutral weights.
+                    tokens.append([self.id_end] * 75)
+                    multipliers.append([1.0] * 75)
+
+            z1 = self.process_tokens(tokens, multipliers)
+            z = z1 if z is None else torch.cat((z, z1), axis=-2)
+
+            remade_batch_tokens = rem_tokens
+            batch_multipliers = rem_multipliers
+            i += 1
+
+        return z
+
+    def process_tokens(self, remade_batch_tokens, batch_multipliers):
+        """Encode one chunk per prompt and apply the emphasis multipliers.
+
+        With the new emphasis implementation each row is first framed with
+        id_start / id_end (weights 1.0). The encoder output is multiplied by the
+        per-token weights and then rescaled so its overall mean matches the
+        mean before weighting.
+        """
+        if not opts.use_old_emphasis_implementation:
+            remade_batch_tokens = [[self.id_start] + x[:75] + [self.id_end] for x in remade_batch_tokens]
+            batch_multipliers = [[1.0] + x[:75] + [1.0] for x in batch_multipliers]
+
+        tokens = torch.asarray(remade_batch_tokens).to(devices.device)
+
+        if self.id_end != self.id_pad:
+            # Everything after the first id_end in each row becomes id_pad.
+            for batch_pos in range(len(remade_batch_tokens)):
+                index = remade_batch_tokens[batch_pos].index(self.id_end)
+                tokens[batch_pos, index+1:tokens.shape[1]] = self.id_pad
+
+        z = self.encode_with_transformers(tokens)
+
+        # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
+        batch_multipliers_of_same_length = [x + [1.0] * (75 - len(x)) for x in batch_multipliers]
+        batch_multipliers = torch.asarray(batch_multipliers_of_same_length).to(devices.device)
+        original_mean = z.mean()
+        # Adds a trailing axis so each token's weight scales its whole vector;
+        # assumes z is (batch, tokens, channels) - TODO confirm.
+        z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape)
+        new_mean = z.mean()
+        z *= original_mean / new_mean
+
+        return z
+
+
+class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase):
+    """Prompt processor for a wrapped CLIP text encoder.
+
+    Supplies the tokenizer-specific pieces the base class relies on: the comma
+    token id, start / end / pad ids, the bracket-token weight table and the
+    tokenize / encode hooks.
+    """
+
+    def __init__(self, wrapped, hijack):
+        super().__init__(wrapped, hijack)
+        self.tokenizer = wrapped.tokenizer
+
+        vocab = self.tokenizer.get_vocab()
+        self.comma_token = vocab.get(',</w>', None)
+
+        # Weight change carried by every vocabulary entry that contains
+        # brackets: '(' and ']' multiply by 1.1, ')' and '[' divide by 1.1.
+        self.token_mults = {}
+        for piece, token_id in vocab.items():
+            if not any(bracket in piece for bracket in '()[]'):
+                continue
+
+            weight = 1.0
+            for char in piece:
+                if char in '[)':
+                    weight /= 1.1
+                elif char in '](':
+                    weight *= 1.1
+
+            if weight != 1.0:
+                self.token_mults[token_id] = weight
+
+        clip_tokenizer = self.wrapped.tokenizer
+        self.id_start = clip_tokenizer.bos_token_id
+        self.id_end = clip_tokenizer.eos_token_id
+        # Padding uses the end-of-text id as well.
+        self.id_pad = self.id_end
+
+    def tokenize(self, texts):
+        """Return the tokenizer's input ids for ``texts``, untruncated and without special tokens."""
+        encoded = self.wrapped.tokenizer(texts, truncation=False, add_special_tokens=False)
+        return encoded["input_ids"]
+
+    def encode_with_transformers(self, tokens):
+        """Run the wrapped transformer on ``tokens``.
+
+        Returns the last hidden state, or - when opts.CLIP_stop_at_last_layers
+        is greater than 1 - the hidden state that many layers from the end,
+        passed through the text model's final layer norm.
+        """
+        outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=-opts.CLIP_stop_at_last_layers)
+
+        if not opts.CLIP_stop_at_last_layers > 1:
+            return outputs.last_hidden_state
+
+        hidden = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
+        return self.wrapped.transformer.text_model.final_layer_norm(hidden)
+
+    def encode_embedding_init_text(self, init_text, nvpt):
+        """Embed ``init_text`` with the unwrapped token embedding layer.
+
+        ``nvpt`` is passed to the tokenizer as max_length; the leading
+        dimension of the result is squeezed away.
+        """
+        embeddings = self.wrapped.transformer.text_model.embeddings
+        ids = self.wrapped.tokenizer(init_text, max_length=nvpt, return_tensors="pt", add_special_tokens=False)["input_ids"]
+
+        raw_embedding = embeddings.token_embedding.wrapped
+        return raw_embedding(ids.to(raw_embedding.weight.device)).squeeze(0)
--- a/modules/sd_hijack_inpainting.py
+++ b/modules/sd_hijack_inpainting.py
@ -1,3 +1,4 @@
+import os
 import torch

 from einops import repeat
@ -11,196 +12,11 @@ from ldm.models.diffusion.ddpm import LatentDiffusion
 from ldm.models.diffusion.plms import PLMSSampler
 from ldm.models.diffusion.ddim import DDIMSampler, noise_like

-# =================================================================================================
-# Monkey patch DDIMSampler methods from RunwayML repo directly.
-# Adapted from:
-# https://github.com/runwayml/stable-diffusion/blob/main/ldm/models/diffusion/ddim.py
-# =================================================================================================
-@torch.no_grad()
-def sample_ddim(self,
-            S,
-            batch_size,
-            shape,
-            conditioning=None,
-            callback=None,
-            normals_sequence=None,
-            img_callback=None,
-            quantize_x0=False,
-            eta=0.,
-            mask=None,
-            x0=None,
-            temperature=1.,
-            noise_dropout=0.,
-            score_corrector=None,
-            corrector_kwargs=None,
-            verbose=True,
-            x_T=None,
-            log_every_t=100,
-            unconditional_guidance_scale=1.,
-            unconditional_conditioning=None,
-            # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-            **kwargs
-            ):
-    if conditioning is not None:
-        if isinstance(conditioning, dict):
-            ctmp = conditioning[list(conditioning.keys())[0]]
-            while isinstance(ctmp, list):
-                ctmp = ctmp[0]
-            cbs = ctmp.shape[0]
-            if cbs != batch_size:
-                print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
-        else:
-            if conditioning.shape[0] != batch_size:
-                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
-
-    self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
-    # sampling
-    C, H, W = shape
-    size = (batch_size, C, H, W)
-    print(f'Data shape for DDIM sampling is {size}, eta {eta}')
-
-    samples, intermediates = self.ddim_sampling(conditioning, size,
-                                                callback=callback,
-                                                img_callback=img_callback,
-                                                quantize_denoised=quantize_x0,
-                                                mask=mask, x0=x0,
-                                                ddim_use_original_steps=False,
-                                                noise_dropout=noise_dropout,
-                                                temperature=temperature,
-                                                score_corrector=score_corrector,
-                                                corrector_kwargs=corrector_kwargs,
-                                                x_T=x_T,
-                                                log_every_t=log_every_t,
-                                                unconditional_guidance_scale=unconditional_guidance_scale,
-                                                unconditional_conditioning=unconditional_conditioning,
-                                                )
-    return samples, intermediates
-
-@torch.no_grad()
-def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                    temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                    unconditional_guidance_scale=1., unconditional_conditioning=None):
-    b, *_, device = *x.shape, x.device
-
-    if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-        e_t = self.model.apply_model(x, t, c)
-    else:
-        x_in = torch.cat([x] * 2)
-        t_in = torch.cat([t] * 2)
-        if isinstance(c, dict):
-            assert isinstance(unconditional_conditioning, dict)
-            c_in = dict()
-            for k in c:
-                if isinstance(c[k], list):
-                    c_in[k] = [
-                        torch.cat([unconditional_conditioning[k][i], c[k][i]])
-                        for i in range(len(c[k]))
-                    ]
-                else:
-                    c_in[k] = torch.cat([unconditional_conditioning[k], c[k]])
-        else:
-            c_in = torch.cat([unconditional_conditioning, c])
-        e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-        e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
-
-    if score_corrector is not None:
-        assert self.model.parameterization == "eps"
-        e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
-
-    alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-    alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-    sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-    sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
-    # select parameters corresponding to the currently considered timestep
-    a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
-    a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
-    sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-    sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
-
-    # current prediction for x_0
-    pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
-    if quantize_denoised:
-        pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
-    # direction pointing to x_t
-    dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
-    noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
-    if noise_dropout > 0.:
-        noise = torch.nn.functional.dropout(noise, p=noise_dropout)
-    x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
-    return x_prev, pred_x0
-
-
-# =================================================================================================
-# Monkey patch PLMSSampler methods.
-# This one was not actually patched correctly in the RunwayML repo, but we can replicate the changes.
-# Adapted from:
-# https://github.com/CompVis/stable-diffusion/blob/main/ldm/models/diffusion/plms.py
-# =================================================================================================
-@torch.no_grad()
-def sample_plms(self,
-            S,
-            batch_size,
-            shape,
-            conditioning=None,
-            callback=None,
-            normals_sequence=None,
-            img_callback=None,
-            quantize_x0=False,
-            eta=0.,
-            mask=None,
-            x0=None,
-            temperature=1.,
-            noise_dropout=0.,
-            score_corrector=None,
-            corrector_kwargs=None,
-            verbose=True,
-            x_T=None,
-            log_every_t=100,
-            unconditional_guidance_scale=1.,
-            unconditional_conditioning=None,
-            # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-            **kwargs
-            ):
-    if conditioning is not None:
-        if isinstance(conditioning, dict):
-            ctmp = conditioning[list(conditioning.keys())[0]]
-            while isinstance(ctmp, list):
-                ctmp = ctmp[0]
-            cbs = ctmp.shape[0]
-            if cbs != batch_size:
-                print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
-        else:
-            if conditioning.shape[0] != batch_size:
-                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
-
-    self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
-    # sampling
-    C, H, W = shape
-    size = (batch_size, C, H, W)
-    print(f'Data shape for PLMS sampling is {size}')
-
-    samples, intermediates = self.plms_sampling(conditioning, size,
-                                                callback=callback,
-                                                img_callback=img_callback,
-                                                quantize_denoised=quantize_x0,
-                                                mask=mask, x0=x0,
-                                                ddim_use_original_steps=False,
-                                                noise_dropout=noise_dropout,
-                                                temperature=temperature,
-                                                score_corrector=score_corrector,
-                                                corrector_kwargs=corrector_kwargs,
-                                                x_T=x_T,
-                                                log_every_t=log_every_t,
-                                                unconditional_guidance_scale=unconditional_guidance_scale,
-                                                unconditional_conditioning=unconditional_conditioning,
-                                                )
-    return samples, intermediates
-

@torch.no_grad()
 def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                    temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                    unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
+                  temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                  unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None, dynamic_threshold=None):
    b, *_, device = *x.shape, x.device

    def get_model_output(x, t):
@ -209,7 +25,7 @@ def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=F
        else:
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t] * 2)
-            
+
            if isinstance(c, dict):
                assert isinstance(unconditional_conditioning, dict)
                c_in = dict()
@ -249,6 +65,8 @@ def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=F
        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
        if quantize_denoised:
            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
+        if dynamic_threshold is not None:
+            pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)
        # direction pointing to x_t
        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
@ -276,56 +94,18 @@ def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=F
    x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

    return x_prev, pred_x0, e_t
-    
-# =================================================================================================
-# Monkey patch LatentInpaintDiffusion to load the checkpoint with a proper config.
-# Adapted from:
-# https://github.com/runwayml/stable-diffusion/blob/main/ldm/models/diffusion/ddpm.py
-# =================================================================================================
-
-@torch.no_grad()
-def get_unconditional_conditioning(self, batch_size, null_label=None):
-    if null_label is not None:
-        xc = null_label
-        if isinstance(xc, ListConfig):
-            xc = list(xc)
-        if isinstance(xc, dict) or isinstance(xc, list):
-            c = self.get_learned_conditioning(xc)
-        else:
-            if hasattr(xc, "to"):
-                xc = xc.to(self.device)
-            c = self.get_learned_conditioning(xc)
-    else:
-        # todo: get null label from cond_stage_model
-        raise NotImplementedError()
-    c = repeat(c, "1 ... -> b ...", b=batch_size).to(self.device)
-    return c
-
-
-class LatentInpaintDiffusion(LatentDiffusion):
-    def __init__(
-        self,
-        concat_keys=("mask", "masked_image"),
-        masked_image_key="masked_image",
-        *args,
-        **kwargs,
-    ):
-        super().__init__(*args, **kwargs)
-        self.masked_image_key = masked_image_key
-        assert self.masked_image_key in concat_keys
-        self.concat_keys = concat_keys


 def should_hijack_inpainting(checkpoint_info):
-    return str(checkpoint_info.filename).endswith("inpainting.ckpt") and not checkpoint_info.config.endswith("inpainting.yaml")
+    """Return True when the checkpoint file name contains "inpainting" but its config file name does not."""
+    from modules import sd_models  # imported here: sd_models itself imports this module
+    checkpoint_name = os.path.basename(checkpoint_info.filename).lower()
+    config_name = os.path.basename(sd_models.find_checkpoint_config(checkpoint_info)).lower()
+    # case-insensitive substring match on the base names only
+    return "inpainting" in checkpoint_name and "inpainting" not in config_name


 def do_inpainting_hijack():
-    ldm.models.diffusion.ddpm.get_unconditional_conditioning = get_unconditional_conditioning
-    ldm.models.diffusion.ddpm.LatentInpaintDiffusion = LatentInpaintDiffusion
-
-    ldm.models.diffusion.ddim.DDIMSampler.p_sample_ddim = p_sample_ddim
-    ldm.models.diffusion.ddim.DDIMSampler.sample = sample_ddim
+    # only PLMSSampler.p_sample_plms is replaced: it is needed because PLMS can't work with dicts as conditionings

    ldm.models.diffusion.plms.PLMSSampler.p_sample_plms = p_sample_plms
-    ldm.models.diffusion.plms.PLMSSampler.sample = sample_plms
--- a/modules/sd_hijack_open_clip.py
+++ b/modules/sd_hijack_open_clip.py
@ -0,0 +1,37 @@
+import open_clip.tokenizer
+import torch
+
+from modules import sd_hijack_clip, devices
+from modules.shared import opts
+
+tokenizer = open_clip.tokenizer._tokenizer  # module-level tokenizer shared by this file; note it is a private open_clip attribute
+
+
+class FrozenOpenCLIPEmbedderWithCustomWords(sd_hijack_clip.FrozenCLIPEmbedderWithCustomWordsBase):
+    """Custom-words embedder that tokenizes with this module's open_clip tokenizer."""
+
+    def __init__(self, wrapped, hijack):
+        super().__init__(wrapped, hijack)
+
+        vocab = tokenizer.encoder
+        self.comma_token = [ident for text, ident in vocab.items() if text == ',</w>'][0]
+        self.id_start = vocab["<start_of_text>"]
+        self.id_end = vocab["<end_of_text>"]
+        self.id_pad = 0
+
+    def tokenize(self, texts):
+        """Encode every prompt in texts into a list of token ids."""
+        assert not opts.use_old_emphasis_implementation, 'Old emphasis implementation not supported for Open Clip'
+
+        return [tokenizer.encode(text) for text in texts]
+
+    def encode_with_transformers(self, tokens):
+        """Delegate to the wrapped model; the layer selection described in the comment below is not done here."""
+        # set self.wrapped.layer_idx here according to opts.CLIP_stop_at_last_layers
+        return self.wrapped.encode_with_transformer(tokens)
+
+    def encode_embedding_init_text(self, init_text, nvpt):
+        """Embed init_text via the `.wrapped` token embedding; nvpt is not used by this implementation."""
+        token_ids = tokenizer.encode(init_text)
+        batch = torch.asarray([token_ids], device=devices.device, dtype=torch.int)
+        return self.wrapped.model.token_embedding.wrapped(batch).squeeze(0)
--- a/modules/sd_hijack_optimizations.py
+++ b/modules/sd_hijack_optimizations.py
@ -127,7 +127,7 @@ def check_for_psutil():

 invokeAI_mps_available = check_for_psutil()

-# -- Taken from https://github.com/invoke-ai/InvokeAI --
+# -- Taken from https://github.com/invoke-ai/InvokeAI and modified --
 if invokeAI_mps_available:
    import psutil
    mem_total_gb = psutil.virtual_memory().total // (1 << 30)
@ -152,14 +152,16 @@ def einsum_op_slice_1(q, k, v, slice_size):
    return r

 def einsum_op_mps_v1(q, k, v):
-    if q.shape[1] <= 4096: # (512x512) max q.shape[1]: 4096
+    if q.shape[0] * q.shape[1] <= 2**16: # threshold is on q.shape[0] * q.shape[1]; (512x512) max q.shape[1]: 4096
        return einsum_op_compvis(q, k, v)
    else:
        slice_size = math.floor(2**30 / (q.shape[0] * q.shape[1]))
+        if slice_size % 4096 == 0:  # NOTE(review): exact multiples of 4096 are avoided; the reason is not stated here - confirm
+            slice_size -= 1
        return einsum_op_slice_1(q, k, v, slice_size)

 def einsum_op_mps_v2(q, k, v):
-    if mem_total_gb > 8 and q.shape[1] <= 4096:
+    if mem_total_gb > 8 and q.shape[0] * q.shape[1] <= 2**16:  # same q.shape[0] * q.shape[1] limit as einsum_op_mps_v1
        return einsum_op_compvis(q, k, v)
    else:
        return einsum_op_slice_0(q, k, v, 1)
@ -188,7 +190,7 @@ def einsum_op(q, k, v):
        return einsum_op_cuda(q, k, v)

    if q.device.type == 'mps':
-        if mem_total_gb >= 32:
+        if mem_total_gb >= 32 and q.shape[0] % 32 != 0 and q.shape[0] * q.shape[1] < 2**18:
            return einsum_op_mps_v1(q, k, v)
        return einsum_op_mps_v2(q, k, v)

--- a/modules/sd_hijack_unet.py
+++ b/modules/sd_hijack_unet.py
@ -0,0 +1,30 @@
+import torch
+
+
+class TorchHijackForUnet:
+    """
+    This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;
+    this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64
+    """
+
+    def __getattr__(self, item):
+        """Forward every attribute that is not defined on this class to the torch module."""
+        # __getattr__ only runs after normal lookup fails, so `cat` (defined below) never gets here
+        if hasattr(torch, item):
+            return getattr(torch, item)
+
+        raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))
+
+    def cat(self, tensors, *args, **kwargs):
+        """torch.cat, except that for exactly two tensors the first is resized to the second's last two dims."""
+        if len(tensors) == 2:
+            a, b = tensors
+            if a.shape[-2:] != b.shape[-2:]:
+                # nearest-neighbour resize of `a` so both inputs agree on their last two dimensions
+                a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")
+            tensors = (a, b)
+
+        return torch.cat(tensors, *args, **kwargs)
+
+
+th = TorchHijackForUnet()  # module-level instance; presumably swapped in where UNet code expects the torch module - TODO confirm
--- a/modules/sd_hijack_xlmr.py
+++ b/modules/sd_hijack_xlmr.py
@ -0,0 +1,34 @@
+import open_clip.tokenizer
+import torch
+
+from modules import sd_hijack_clip, devices
+from modules.shared import opts
+
+
+class FrozenXLMREmbedderWithCustomWords(sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords):
+    """CLIP custom-words embedder variant: special token ids come from wrapped.config, comma token is a bare ','."""
+
+    def __init__(self, wrapped, hijack):
+        super().__init__(wrapped, hijack)
+
+        model_config = wrapped.config
+        self.id_start = model_config.bos_token_id
+        self.id_end = model_config.eos_token_id
+        self.id_pad = model_config.pad_token_id
+
+        self.comma_token = self.tokenizer.get_vocab().get(',', None)  # alt diffusion doesn't have </w> bits for comma
+
+    def encode_with_transformers(self, tokens):
+        """Run the wrapped model with padding positions masked out and return its 'projection_state' output."""
+        # there's no CLIP Skip here because all hidden layers have size of 1024 and the last one uses a
+        # trained layer to transform those 1024 into 768 for unet; so you can't choose which transformer
+        # layer to work with - you have to use the last
+        not_padding = (tokens != self.id_pad).to(device=tokens.device, dtype=torch.int64)
+        return self.wrapped(input_ids=tokens, attention_mask=not_padding)['projection_state']
+
+    def encode_embedding_init_text(self, init_text, nvpt):
+        """Embed init_text (tokenized with max_length=nvpt, no special tokens) via the `.wrapped` token embedding."""
+        token_embedding = self.wrapped.roberta.embeddings.token_embedding.wrapped
+        tokenized = self.wrapped.tokenizer(init_text, max_length=nvpt, return_tensors="pt", add_special_tokens=False)
+        ids = tokenized["input_ids"].to(devices.device)
+        return token_embedding(ids).squeeze(0)
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@ -5,7 +5,11 @@ import gc
 from collections import namedtuple
 import torch
 import re
+import safetensors.torch
 from omegaconf import OmegaConf
+from os import mkdir
+from urllib import request
+import ldm.modules.midas as midas

 from ldm.util import instantiate_from_config

@ -16,7 +20,7 @@ from modules.sd_hijack_inpainting import do_inpainting_hijack, should_hijack_inp
 model_dir = "Stable-diffusion"
 model_path = os.path.abspath(os.path.join(models_path, model_dir))

-CheckpointInfo = namedtuple("CheckpointInfo", ['filename', 'title', 'hash', 'model_name', 'config'])
+CheckpointInfo = namedtuple("CheckpointInfo", ['filename', 'title', 'hash', 'model_name'])
 checkpoints_list = {}
 checkpoints_loaded = collections.OrderedDict()

@ -35,6 +39,7 @@ def setup_model():
        os.makedirs(model_path)

    list_models()
+    enable_midas_autodownload()


 def checkpoint_tiles(): 
@ -43,9 +48,17 @@ def checkpoint_tiles():
    return sorted([x.title for x in checkpoints_list.values()], key = alphanumeric_key)


+def find_checkpoint_config(info):
+    """Return the .yaml file next to the checkpoint if it exists, otherwise shared.cmd_opts.config."""
+    sidecar, _ = os.path.splitext(info.filename)
+    sidecar += ".yaml"
+    # fall back to the command-line config when the checkpoint has no config of its own
+    return sidecar if os.path.exists(sidecar) else shared.cmd_opts.config
+
+
 def list_models():
    checkpoints_list.clear()
-    model_list = modelloader.load_models(model_path=model_path, command_path=shared.cmd_opts.ckpt_dir, ext_filter=[".ckpt"])
+    model_list = modelloader.load_models(model_path=model_path, command_path=shared.cmd_opts.ckpt_dir, ext_filter=[".ckpt", ".safetensors"])

    def modeltitle(path, shorthash):
        abspath = os.path.abspath(path)
@ -68,7 +81,7 @@ def list_models():
    if os.path.exists(cmd_ckpt):
        h = model_hash(cmd_ckpt)
        title, short_model_name = modeltitle(cmd_ckpt, h)
-        checkpoints_list[title] = CheckpointInfo(cmd_ckpt, title, h, short_model_name, shared.cmd_opts.config)
+        checkpoints_list[title] = CheckpointInfo(cmd_ckpt, title, h, short_model_name)
        shared.opts.data['sd_model_checkpoint'] = title
    elif cmd_ckpt is not None and cmd_ckpt != shared.default_sd_model_file:
        print(f"Checkpoint in --ckpt argument not found (Possible it was moved to {model_path}: {cmd_ckpt}", file=sys.stderr)
@ -76,12 +89,7 @@ def list_models():
        h = model_hash(filename)
        title, short_model_name = modeltitle(filename, h)

-        basename, _ = os.path.splitext(filename)
-        config = basename + ".yaml"
-        if not os.path.exists(config):
-            config = shared.cmd_opts.config
-
-        checkpoints_list[title] = CheckpointInfo(filename, title, h, short_model_name, config)
+        checkpoints_list[title] = CheckpointInfo(filename, title, h, short_model_name)


 def get_closet_checkpoint_match(searchString):
@ -106,18 +114,19 @@ def model_hash(filename):

 def select_checkpoint():
    model_checkpoint = shared.opts.sd_model_checkpoint
+        
    checkpoint_info = checkpoints_list.get(model_checkpoint, None)
    if checkpoint_info is not None:
        return checkpoint_info

    if len(checkpoints_list) == 0:
-        print(f"No checkpoints found. When searching for checkpoints, looked at:", file=sys.stderr)
+        print("No checkpoints found. When searching for checkpoints, looked at:", file=sys.stderr)
        if shared.cmd_opts.ckpt is not None:
            print(f" - file {os.path.abspath(shared.cmd_opts.ckpt)}", file=sys.stderr)
        print(f" - directory {model_path}", file=sys.stderr)
        if shared.cmd_opts.ckpt_dir is not None:
            print(f" - directory {os.path.abspath(shared.cmd_opts.ckpt_dir)}", file=sys.stderr)
-        print(f"Can't run without a checkpoint. Find and place a .ckpt file into any of those locations. The program will exit.", file=sys.stderr)
+        print("Can't run without a checkpoint. Find and place a .ckpt file into any of those locations. The program will exit.", file=sys.stderr)
        exit(1)

    checkpoint_info = next(iter(checkpoints_list.values()))
@ -143,8 +152,8 @@ def transform_checkpoint_dict_key(k):


 def get_state_dict_from_checkpoint(pl_sd):
-    if "state_dict" in pl_sd:
-        pl_sd = pl_sd["state_dict"]
+    pl_sd = pl_sd.pop("state_dict", pl_sd)
+    pl_sd.pop("state_dict", None)

    sd = {}
    for k, v in pl_sd.items():
@ -159,27 +168,44 @@ def get_state_dict_from_checkpoint(pl_sd):
    return pl_sd


+def read_state_dict(checkpoint_file, print_global_state=False, map_location=None):
+    """Load a checkpoint (.safetensors or torch pickle) and return it via get_state_dict_from_checkpoint."""
+    target = map_location or shared.weight_load_location
+    if os.path.splitext(checkpoint_file)[1].lower() != ".safetensors":
+        # NOTE: torch.load unpickles the file, so only load checkpoints from trusted sources
+        pl_sd = torch.load(checkpoint_file, map_location=target)
+    else:
+        if target is None:
+            # no location configured: choose one explicitly for the safetensors loader
+            target = devices.get_cuda_device_string() if torch.cuda.is_available() else "cpu"
+        pl_sd = safetensors.torch.load_file(checkpoint_file, device=target)
+
+    if print_global_state and "global_step" in pl_sd:
+        print(f"Global Step: {pl_sd['global_step']}")
+    return get_state_dict_from_checkpoint(pl_sd)
+
+
 def load_model_weights(model, checkpoint_info, vae_file="auto"):
    checkpoint_file = checkpoint_info.filename
    sd_model_hash = checkpoint_info.hash

-    if shared.opts.sd_checkpoint_cache > 0 and hasattr(model, "sd_checkpoint_info"):
-        sd_vae.restore_base_vae(model)
-        checkpoints_loaded[model.sd_checkpoint_info] = model.state_dict().copy()
+    cache_enabled = shared.opts.sd_checkpoint_cache > 0

-    vae_file = sd_vae.resolve_vae(checkpoint_file, vae_file=vae_file)
-
-    if checkpoint_info not in checkpoints_loaded:
+    if cache_enabled and checkpoint_info in checkpoints_loaded:
+        # use checkpoint cache
+        print(f"Loading weights [{sd_model_hash}] from cache")
+        model.load_state_dict(checkpoints_loaded[checkpoint_info])
+    else:
+        # load from file
        print(f"Loading weights [{sd_model_hash}] from {checkpoint_file}")

-        pl_sd = torch.load(checkpoint_file, map_location=shared.weight_load_location)
-        if "global_step" in pl_sd:
-            print(f"Global Step: {pl_sd['global_step']}")
-
-        sd = get_state_dict_from_checkpoint(pl_sd)
-        del pl_sd
+        sd = read_state_dict(checkpoint_file)
        model.load_state_dict(sd, strict=False)
        del sd
+        
+        if cache_enabled:
+            # cache newly loaded model
+            checkpoints_loaded[checkpoint_info] = model.state_dict().copy()

        if shared.cmd_opts.opt_channelslast:
            model.to(memory_format=torch.channels_last)
@ -199,29 +225,73 @@ def load_model_weights(model, checkpoint_info, vae_file="auto"):

        model.first_stage_model.to(devices.dtype_vae)

-    else:
-        vae_name = sd_vae.get_filename(vae_file) if vae_file else None
-        vae_message = f" with {vae_name} VAE" if vae_name else ""
-        print(f"Loading weights [{sd_model_hash}]{vae_message} from cache")
-        model.load_state_dict(checkpoints_loaded[checkpoint_info])
-
-    if shared.opts.sd_checkpoint_cache > 0:
-        while len(checkpoints_loaded) > shared.opts.sd_checkpoint_cache:
+    # clean up cache if limit is reached
+    if cache_enabled:
+        while len(checkpoints_loaded) > shared.opts.sd_checkpoint_cache + 1: # we need to count the current model
            checkpoints_loaded.popitem(last=False)  # LRU

    model.sd_model_hash = sd_model_hash
    model.sd_model_checkpoint = checkpoint_file
    model.sd_checkpoint_info = checkpoint_info

+    model.logvar = model.logvar.to(devices.device)  # fix for training
+
+    sd_vae.delete_base_vae()
+    sd_vae.clear_loaded_vae()
+    vae_file = sd_vae.resolve_vae(checkpoint_file, vae_file=vae_file)
    sd_vae.load_vae(model, vae_file)


+def enable_midas_autodownload():
+    """
+    Gives the ldm.modules.midas.api.load_model function automatic downloading.
+
+    When the 512-depth-ema model, and other future models like it, is loaded,
+    it calls midas.api.load_model to load the associated midas depth model.
+    This function applies a wrapper to download the model to the correct
+    location automatically. Calling it more than once is harmless.
+    """
+    midas_path = os.path.join(models_path, 'midas')
+
+    # stable-diffusion-stability-ai hard-codes the midas model path to
+    # a location that differs from where other scripts using this model look.
+    # HACK: Overriding the path here.
+    for k, v in midas.api.ISL_PATHS.items():
+        midas.api.ISL_PATHS[k] = os.path.join(midas_path, os.path.basename(v))
+
+    midas_urls = {
+        "dpt_large": "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt",
+        "dpt_hybrid": "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid-midas-501f0c75.pt",
+        "midas_v21": "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21-f6b98070.pt",
+        "midas_v21_small": "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21_small-70d6b9c8.pt",
+    }
+
+    # remember the real loader only once; re-saving it on a second call would make the wrapper call itself forever
+    if not hasattr(midas.api, "load_model_inner"):
+        midas.api.load_model_inner = midas.api.load_model
+
+    def load_model_wrapper(model_type):
+        path = midas.api.ISL_PATHS[model_type]
+        if not os.path.exists(path):
+            os.makedirs(midas_path, exist_ok=True)  # also creates missing parent directories
+            print(f"Downloading midas model weights for {model_type} to {path}")
+            # download under a temporary name so an interrupted transfer is never mistaken for the model
+            part_path = path + ".part"
+            request.urlretrieve(midas_urls[model_type], part_path)
+            os.replace(part_path, path)
+            print(f"{model_type} downloaded")
+        return midas.api.load_model_inner(model_type)
+
+    midas.api.load_model = load_model_wrapper
+
+
 def load_model(checkpoint_info=None):
    from modules import lowvram, sd_hijack
    checkpoint_info = checkpoint_info or select_checkpoint()
+    checkpoint_config = find_checkpoint_config(checkpoint_info)

-    if checkpoint_info.config != shared.cmd_opts.config:
-        print(f"Loading config from: {checkpoint_info.config}")
+    if checkpoint_config != shared.cmd_opts.config:
+        print(f"Loading config from: {checkpoint_config}")

    if shared.sd_model:
        sd_hijack.model_hijack.undo_hijack(shared.sd_model)
@ -229,21 +299,25 @@ def load_model(checkpoint_info=None):
        gc.collect()
        devices.torch_gc()

-    sd_config = OmegaConf.load(checkpoint_info.config)
+    sd_config = OmegaConf.load(checkpoint_config)
    
    if should_hijack_inpainting(checkpoint_info):
        # Hardcoded config for now...
        sd_config.model.target = "ldm.models.diffusion.ddpm.LatentInpaintDiffusion"
-        sd_config.model.params.use_ema = False
        sd_config.model.params.conditioning_key = "hybrid"
        sd_config.model.params.unet_config.params.in_channels = 9
+        sd_config.model.params.finetune_keys = None

-        # Create a "fake" config with a different name so that we know to unload it when switching models.
-        checkpoint_info = checkpoint_info._replace(config=checkpoint_info.config.replace(".yaml", "-inpainting.yaml"))
+    if not hasattr(sd_config.model.params, "use_ema"):
+        sd_config.model.params.use_ema = False

    do_inpainting_hijack()

+    if shared.cmd_opts.no_half:
+        sd_config.model.params.unet_config.params.use_fp16 = False
+
    sd_model = instantiate_from_config(sd_config.model)
+
    load_model_weights(sd_model, checkpoint_info)

    if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
@ -256,23 +330,29 @@ def load_model(checkpoint_info=None):
    sd_model.eval()
    shared.sd_model = sd_model

+    sd_hijack.model_hijack.embedding_db.load_textual_inversion_embeddings(force_reload=True)  # Reload embeddings after model load as they may or may not fit the model
+
    script_callbacks.model_loaded_callback(sd_model)

-    print(f"Model loaded.")
+    print("Model loaded.")
+
    return sd_model


 def reload_model_weights(sd_model=None, info=None):
    from modules import lowvram, devices, sd_hijack
    checkpoint_info = info or select_checkpoint()
- 
+
    if not sd_model:
        sd_model = shared.sd_model

+    current_checkpoint_info = sd_model.sd_checkpoint_info
+    checkpoint_config = find_checkpoint_config(current_checkpoint_info)
+
    if sd_model.sd_model_checkpoint == checkpoint_info.filename:
        return

-    if sd_model.sd_checkpoint_info.config != checkpoint_info.config or should_hijack_inpainting(checkpoint_info) != should_hijack_inpainting(sd_model.sd_checkpoint_info):
+    if checkpoint_config != find_checkpoint_config(checkpoint_info) or should_hijack_inpainting(checkpoint_info) != should_hijack_inpainting(sd_model.sd_checkpoint_info):
        del sd_model
        checkpoints_loaded.clear()
        load_model(checkpoint_info)
@ -285,13 +365,19 @@ def reload_model_weights(sd_model=None, info=None):

    sd_hijack.model_hijack.undo_hijack(sd_model)

-    load_model_weights(sd_model, checkpoint_info)
+    try:
+        load_model_weights(sd_model, checkpoint_info)
+    except Exception:
+        # Roll back to the checkpoint that was active on entry, then let the
+        # original error propagate to the caller.
+        print("Failed to load checkpoint, restoring previous")
+        load_model_weights(sd_model, current_checkpoint_info)
+        raise
+    finally:
+        # Runs on success and on failure: the hijack was undone above, so it is
+        # re-applied to whichever weights ended up in the model.
+        sd_hijack.model_hijack.hijack(sd_model)
+        script_callbacks.model_loaded_callback(sd_model)

-    sd_hijack.model_hijack.hijack(sd_model)
-    script_callbacks.model_loaded_callback(sd_model)
+        if not shared.cmd_opts.lowvram and not shared.cmd_opts.medvram:
+            sd_model.to(devices.device)

-    if not shared.cmd_opts.lowvram and not shared.cmd_opts.medvram:
-        sd_model.to(devices.device)
+    print("Weights loaded.")

-    print(f"Weights loaded.")
    return sd_model
--- a/modules/sd_samplers.py
+++ b/modules/sd_samplers.py
@ -1,4 +1,4 @@
-from collections import namedtuple
+from collections import namedtuple, deque
 import numpy as np
 from math import floor
 import torch
@ -6,9 +6,10 @@ import tqdm
 from PIL import Image
 import inspect
 import k_diffusion.sampling
+import torchsde._brownian.brownian_interval
 import ldm.models.diffusion.ddim
 import ldm.models.diffusion.plms
-from modules import prompt_parser, devices, processing, images
+from modules import prompt_parser, devices, processing, images, sd_vae_approx

 from modules.shared import opts, cmd_opts, state
 import modules.shared as shared
@ -18,21 +19,23 @@ from modules.script_callbacks import CFGDenoiserParams, cfg_denoiser_callback
 SamplerData = namedtuple('SamplerData', ['name', 'constructor', 'aliases', 'options'])

 samplers_k_diffusion = [
-    ('Euler a', 'sample_euler_ancestral', ['k_euler_a'], {}),
+    ('Euler a', 'sample_euler_ancestral', ['k_euler_a', 'k_euler_ancestral'], {}),
    ('Euler', 'sample_euler', ['k_euler'], {}),
    ('LMS', 'sample_lms', ['k_lms'], {}),
    ('Heun', 'sample_heun', ['k_heun'], {}),
-    ('DPM2', 'sample_dpm_2', ['k_dpm_2'], {}),
-    ('DPM2 a', 'sample_dpm_2_ancestral', ['k_dpm_2_a'], {}),
+    ('DPM2', 'sample_dpm_2', ['k_dpm_2'], {'discard_next_to_last_sigma': True}),
+    ('DPM2 a', 'sample_dpm_2_ancestral', ['k_dpm_2_a'], {'discard_next_to_last_sigma': True}),
    ('DPM++ 2S a', 'sample_dpmpp_2s_ancestral', ['k_dpmpp_2s_a'], {}),
    ('DPM++ 2M', 'sample_dpmpp_2m', ['k_dpmpp_2m'], {}),
+    ('DPM++ SDE', 'sample_dpmpp_sde', ['k_dpmpp_sde'], {}),
    ('DPM fast', 'sample_dpm_fast', ['k_dpm_fast'], {}),
    ('DPM adaptive', 'sample_dpm_adaptive', ['k_dpm_ad'], {}),
    ('LMS Karras', 'sample_lms', ['k_lms_ka'], {'scheduler': 'karras'}),
-    ('DPM2 Karras', 'sample_dpm_2', ['k_dpm_2_ka'], {'scheduler': 'karras'}),
-    ('DPM2 a Karras', 'sample_dpm_2_ancestral', ['k_dpm_2_a_ka'], {'scheduler': 'karras'}),
+    ('DPM2 Karras', 'sample_dpm_2', ['k_dpm_2_ka'], {'scheduler': 'karras', 'discard_next_to_last_sigma': True}),
+    ('DPM2 a Karras', 'sample_dpm_2_ancestral', ['k_dpm_2_a_ka'], {'scheduler': 'karras', 'discard_next_to_last_sigma': True}),
    ('DPM++ 2S a Karras', 'sample_dpmpp_2s_ancestral', ['k_dpmpp_2s_a_ka'], {'scheduler': 'karras'}),
    ('DPM++ 2M Karras', 'sample_dpmpp_2m', ['k_dpmpp_2m_ka'], {'scheduler': 'karras'}),
+    ('DPM++ SDE Karras', 'sample_dpmpp_sde', ['k_dpmpp_sde_ka'], {'scheduler': 'karras'}),
 ]

 samplers_data_k_diffusion = [
@ -46,16 +49,24 @@ all_samplers = [
    SamplerData('DDIM', lambda model: VanillaStableDiffusionSampler(ldm.models.diffusion.ddim.DDIMSampler, model), [], {}),
    SamplerData('PLMS', lambda model: VanillaStableDiffusionSampler(ldm.models.diffusion.plms.PLMSSampler, model), [], {}),
 ]
+all_samplers_map = {x.name: x for x in all_samplers}

 samplers = []
 samplers_for_img2img = []
+samplers_map = {}


-def create_sampler_with_index(list_of_configs, index, model):
-    config = list_of_configs[index]
+def create_sampler(name, model):
+    """Instantiate the sampler registered under `name` for `model`.
+
+    A `name` of None selects the first entry of `all_samplers`; any other
+    value is looked up in `all_samplers_map`. An unknown name trips the
+    assertion below. The matched config is attached to the returned sampler
+    as `sampler.config`.
+    """
+    config = all_samplers[0] if name is None else all_samplers_map.get(name, None)
+
+    assert config is not None, f'bad sampler name: {name}'
+
    sampler = config.constructor(model)
    sampler.config = config
-    
+
    return sampler


@ -68,6 +79,12 @@ def set_samplers():
    samplers = [x for x in all_samplers if x.name not in hidden]
    samplers_for_img2img = [x for x in all_samplers if x.name not in hidden_img2img]

+    samplers_map.clear()
+    for sampler in all_samplers:
+        samplers_map[sampler.name.lower()] = sampler.name
+        for alias in sampler.aliases:
+            samplers_map[alias.lower()] = sampler.name
+

 set_samplers()

@ -89,20 +106,32 @@ def setup_img2img_steps(p, steps=None):
    return steps, t_enc


-def single_sample_to_image(sample):
-    x_sample = processing.decode_first_stage(shared.sd_model, sample.unsqueeze(0))[0]
+approximation_indexes = {"Full": 0, "Approx NN": 1, "Approx cheap": 2}
+
+
+def single_sample_to_image(sample, approximation=None):
+    """Turn a single latent `sample` into a PIL image.
+
+    `approximation` picks the decoder: 2 uses `sd_vae_approx.cheap_approximation`,
+    1 runs the `sd_vae_approx.model()` network, and any other value does a full
+    `processing.decode_first_stage` with the loaded model. When it is None the
+    value is looked up from `opts.show_progress_type` through
+    `approximation_indexes`, falling back to 0 (full decode).
+    """
+    if approximation is None:
+        approximation = approximation_indexes.get(opts.show_progress_type, 0)
+
+    if approximation == 2:
+        x_sample = sd_vae_approx.cheap_approximation(sample)
+    elif approximation == 1:
+        # The approximation network expects a batch dimension; add it, then take item 0 back out.
+        x_sample = sd_vae_approx.model()(sample.to(devices.device, devices.dtype).unsqueeze(0))[0].detach()
+    else:
+        x_sample = processing.decode_first_stage(shared.sd_model, sample.unsqueeze(0))[0]
+
    x_sample = torch.clamp((x_sample + 1.0) / 2.0, min=0.0, max=1.0)
    x_sample = 255. * np.moveaxis(x_sample.cpu().numpy(), 0, 2)
    x_sample = x_sample.astype(np.uint8)
    return Image.fromarray(x_sample)


-def sample_to_image(samples, index=0):
-    return single_sample_to_image(samples[index])
+def sample_to_image(samples, index=0, approximation=None):
+    return single_sample_to_image(samples[index], approximation)


-def samples_to_image_grid(samples):
-    return images.image_grid([single_sample_to_image(sample) for sample in samples])
+def samples_to_image_grid(samples, approximation=None):
+    return images.image_grid([single_sample_to_image(sample, approximation) for sample in samples])


 def store_latent(decoded):
@ -120,7 +149,8 @@ class InterruptedException(BaseException):
 class VanillaStableDiffusionSampler:
    def __init__(self, constructor, sd_model):
        self.sampler = constructor(sd_model)
-        self.orig_p_sample_ddim = self.sampler.p_sample_ddim if hasattr(self.sampler, 'p_sample_ddim') else self.sampler.p_sample_plms
+        self.is_plms = hasattr(self.sampler, 'p_sample_plms')
+        self.orig_p_sample_ddim = self.sampler.p_sample_plms if self.is_plms else self.sampler.p_sample_ddim
        self.mask = None
        self.nmask = None
        self.init_latent = None
@ -211,7 +241,6 @@ class VanillaStableDiffusionSampler:
        self.mask = p.mask if hasattr(p, 'mask') else None
        self.nmask = p.nmask if hasattr(p, 'nmask') else None

-
    def adjust_steps_if_invalid(self, p, num_steps):
        if  (self.config.name == 'DDIM' and p.ddim_discretize == 'uniform') or (self.config.name == 'PLMS'):
            valid_step = 999 / (1000 // num_steps)
@ -220,7 +249,6 @@ class VanillaStableDiffusionSampler:
        
        return num_steps

-
    def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
        steps, t_enc = setup_img2img_steps(p, steps)
        steps = self.adjust_steps_if_invalid(p, steps)
@ -253,9 +281,10 @@ class VanillaStableDiffusionSampler:
        steps = self.adjust_steps_if_invalid(p, steps or p.steps)

        # Wrap the conditioning models with additional image conditioning for inpainting model
+        # dummy_for_plms is needed because PLMS code checks the first item in the dict to have the right shape
        if image_conditioning is not None:
-            conditioning = {"c_concat": [image_conditioning], "c_crossattn": [conditioning]}
-            unconditional_conditioning = {"c_concat": [image_conditioning], "c_crossattn": [unconditional_conditioning]}
+            conditioning = {"dummy_for_plms": np.zeros((conditioning.shape[0],)), "c_crossattn": [conditioning], "c_concat": [image_conditioning]}
+            unconditional_conditioning = {"c_crossattn": [unconditional_conditioning], "c_concat": [image_conditioning]}

        samples_ddim = self.launch_sampling(steps, lambda: self.sampler.sample(S=steps, conditioning=conditioning, batch_size=int(x.shape[0]), shape=x[0].shape, verbose=False, unconditional_guidance_scale=p.cfg_scale, unconditional_conditioning=unconditional_conditioning, x_T=x, eta=self.eta)[0])

@ -271,6 +300,16 @@ class CFGDenoiser(torch.nn.Module):
        self.init_latent = None
        self.step = 0

+    def combine_denoised(self, x_out, conds_list, uncond, cond_scale):
+        """Blend conditional predictions into the unconditional ones.
+
+        The last `uncond.shape[0]` rows of `x_out` are taken as the
+        unconditional predictions. For output item `i`, every
+        `(cond_index, weight)` pair in `conds_list[i]` adds
+        `(x_out[cond_index] - uncond_prediction[i]) * weight * cond_scale`.
+        Returns a new tensor; `x_out` itself is not modified.
+        """
+        uncond_count = uncond.shape[0]
+        uncond_part = x_out[-uncond_count:]
+        # Clone so the in-place accumulation below never writes into x_out.
+        result = uncond_part.clone()
+
+        for sample_index, weighted_conds in enumerate(conds_list):
+            baseline = uncond_part[sample_index]
+            for cond_index, weight in weighted_conds:
+                result[sample_index] += (x_out[cond_index] - baseline) * (weight * cond_scale)
+
+        return result
+
    def forward(self, x, sigma, uncond, cond, cond_scale, image_cond):
        if state.interrupted or state.skipped:
            raise InterruptedException
@ -312,12 +351,7 @@ class CFGDenoiser(torch.nn.Module):

            x_out[-uncond.shape[0]:] = self.inner_model(x_in[-uncond.shape[0]:], sigma_in[-uncond.shape[0]:], cond={"c_crossattn": [uncond], "c_concat": [image_cond_in[-uncond.shape[0]:]]})

-        denoised_uncond = x_out[-uncond.shape[0]:]
-        denoised = torch.clone(denoised_uncond)
-
-        for i, conds in enumerate(conds_list):
-            for cond_index, weight in conds:
-                denoised[i] += (x_out[cond_index] - denoised_uncond[i]) * (weight * cond_scale)
+        denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale)

        if self.mask is not None:
            denoised = self.init_latent * self.mask + self.nmask * denoised
@ -328,28 +362,55 @@ class CFGDenoiser(torch.nn.Module):


 class TorchHijack:
-    def __init__(self, kdiff_sampler):
-        self.kdiff_sampler = kdiff_sampler
+    def __init__(self, sampler_noises):
+        # Using a deque to efficiently receive the sampler_noises in the same order as the previous index-based
+        # implementation.
+        self.sampler_noises = deque(sampler_noises)

    def __getattr__(self, item):
        if item == 'randn_like':
-            return self.kdiff_sampler.randn_like
+            return self.randn_like

        if hasattr(torch, item):
            return getattr(torch, item)

        raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))

+    def randn_like(self, x):
+        """Return noise shaped like `x`, preferring the queued sampler noises.
+
+        The next queued noise is always consumed when one is available; it is
+        returned only if its shape matches `x`, otherwise it is dropped and
+        fresh noise is generated instead.
+        """
+        if self.sampler_noises:
+            queued = self.sampler_noises.popleft()
+            if queued.shape == x.shape:
+                return queued
+
+        if x.device.type != 'mps':
+            return torch.randn_like(x)
+
+        # On MPS the noise is generated on the CPU and then moved to x's device.
+        return torch.randn_like(x, device=devices.cpu).to(x.device)
+
+
+# MPS fix for randn in torchsde
+def torchsde_randn(size, dtype, device, seed):
+    """Seeded `torch.randn` replacement installed into torchsde's brownian interval.
+
+    For an 'mps' device the generator and the tensor are created on the CPU and
+    the result is moved to `device`; for every other device both are created
+    directly on `device`.
+    """
+    on_mps = device.type == 'mps'
+    source_device = devices.cpu if on_mps else device
+
+    generator = torch.Generator(source_device).manual_seed(int(seed))
+    noise = torch.randn(size, dtype=dtype, device=source_device, generator=generator)
+
+    return noise.to(device) if on_mps else noise
+
+
+torchsde._brownian.brownian_interval._randn = torchsde_randn
+

 class KDiffusionSampler:
    def __init__(self, funcname, sd_model):
-        self.model_wrap = k_diffusion.external.CompVisDenoiser(sd_model, quantize=shared.opts.enable_quantization)
+        denoiser = k_diffusion.external.CompVisVDenoiser if sd_model.parameterization == "v" else k_diffusion.external.CompVisDenoiser
+
+        self.model_wrap = denoiser(sd_model, quantize=shared.opts.enable_quantization)
        self.funcname = funcname
        self.func = getattr(k_diffusion.sampling, self.funcname)
        self.extra_params = sampler_extra_params.get(funcname, [])
        self.model_wrap_cfg = CFGDenoiser(self.model_wrap)
        self.sampler_noises = None
-        self.sampler_noise_index = 0
        self.stop_at = None
        self.eta = None
        self.default_eta = 1.0
@ -382,26 +443,13 @@ class KDiffusionSampler:
    def number_of_needed_noises(self, p):
        return p.steps

-    def randn_like(self, x):
-        noise = self.sampler_noises[self.sampler_noise_index] if self.sampler_noises is not None and self.sampler_noise_index < len(self.sampler_noises) else None
-
-        if noise is not None and x.shape == noise.shape:
-            res = noise
-        else:
-            res = torch.randn_like(x)
-
-        self.sampler_noise_index += 1
-        return res
-
    def initialize(self, p):
        self.model_wrap_cfg.mask = p.mask if hasattr(p, 'mask') else None
        self.model_wrap_cfg.nmask = p.nmask if hasattr(p, 'nmask') else None
        self.model_wrap.step = 0
-        self.sampler_noise_index = 0
        self.eta = p.eta or opts.eta_ancestral

-        if self.sampler_noises is not None:
-            k_diffusion.sampling.torch = TorchHijack(self)
+        k_diffusion.sampling.torch = TorchHijack(self.sampler_noises if self.sampler_noises is not None else [])

        extra_params_kwargs = {}
        for param_name in self.extra_params:
@ -413,16 +461,26 @@ class KDiffusionSampler:

        return extra_params_kwargs

-    def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
-        steps, t_enc = setup_img2img_steps(p, steps)
-
+    def get_sigmas(self, p, steps):
+        """Build the sigma schedule for `steps` sampling steps.
+
+        Precedence: `p.sampler_noise_scheduler_override` if set, then a Karras
+        schedule when this sampler's config has `scheduler == 'karras'`,
+        otherwise the wrapped model's own `get_sigmas`. When the config sets
+        `discard_next_to_last_sigma`, the second-to-last sigma is removed from
+        the result.
+        """
        if p.sampler_noise_scheduler_override:
            sigmas = p.sampler_noise_scheduler_override(steps)
        elif self.config is not None and self.config.options.get('scheduler', None) == 'karras':
-            sigmas = k_diffusion.sampling.get_sigmas_karras(n=steps, sigma_min=0.1, sigma_max=10, device=shared.device)
+            # The fixed (0.1, 10) range is kept behind an option; otherwise the range is the first and last entries of the wrapped model's sigmas.
+            sigma_min, sigma_max = (0.1, 10) if opts.use_old_karras_scheduler_sigmas else (self.model_wrap.sigmas[0].item(), self.model_wrap.sigmas[-1].item())
+
+            sigmas = k_diffusion.sampling.get_sigmas_karras(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, device=shared.device)
        else:
            sigmas = self.model_wrap.get_sigmas(steps)

+        if self.config is not None and self.config.options.get('discard_next_to_last_sigma', False):
+            # Keep everything except the second-to-last entry.
+            sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
+
+        return sigmas
+
+    def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
+        steps, t_enc = setup_img2img_steps(p, steps)
+
+        sigmas = self.get_sigmas(p, steps)
+
        sigma_sched = sigmas[steps - t_enc - 1:]
        xi = x + noise * sigma_sched[0]
        
@ -454,12 +512,7 @@ class KDiffusionSampler:
    def sample(self, p, x, conditioning, unconditional_conditioning, steps=None, image_conditioning = None):
        steps = steps or p.steps

-        if p.sampler_noise_scheduler_override:
-            sigmas = p.sampler_noise_scheduler_override(steps)
-        elif self.config is not None and self.config.options.get('scheduler', None) == 'karras':
-            sigmas = k_diffusion.sampling.get_sigmas_karras(n=steps, sigma_min=0.1, sigma_max=10, device=shared.device)
-        else:
-            sigmas = self.model_wrap.get_sigmas(steps)
+        sigmas = self.get_sigmas(p, steps)

        x = x * sigmas[0]

--- a/modules/sd_vae.py
+++ b/modules/sd_vae.py
@ -1,9 +1,11 @@
 import torch
 import os
+import collections
 from collections import namedtuple
 from modules import shared, devices, script_callbacks
 from modules.paths import models_path
 import glob
+from copy import deepcopy


 model_dir = "Stable-diffusion"
@ -15,7 +17,7 @@ vae_path = os.path.abspath(os.path.join(models_path, vae_dir))
 vae_ignore_keys = {"model_ema.decay", "model_ema.num_updates"}


-default_vae_dict = {"auto": "auto", "None": "None"}
+default_vae_dict = {"auto": "auto", "None": None, None: None}
 default_vae_list = ["auto", "None"]


@ -29,6 +31,7 @@ base_vae = None
 loaded_vae_file = None
 checkpoint_info = None

+checkpoints_loaded = collections.OrderedDict()

 def get_base_vae(model):
    if base_vae is not None and checkpoint_info == model.sd_checkpoint_info and model:
@ -39,7 +42,8 @@ def get_base_vae(model):
 def store_base_vae(model):
+    """Snapshot the model's current first-stage weights as the base VAE.
+
+    Does nothing when a snapshot for this model's checkpoint is already held.
+    Asserts that no separate VAE file is currently marked as loaded, so that
+    the snapshot really is the checkpoint's own VAE.
+    """
    global base_vae, checkpoint_info
    if checkpoint_info != model.sd_checkpoint_info:
-        base_vae = model.first_stage_model.state_dict().copy()
+        assert not loaded_vae_file, "Trying to store non-base VAE!"
+        # Deep copy so the stored snapshot does not share tensors with the live model's state dict.
+        base_vae = deepcopy(model.first_stage_model.state_dict())
        checkpoint_info = model.sd_checkpoint_info


@ -50,9 +54,11 @@ def delete_base_vae():


 def restore_base_vae(model):
+    """Load the stored base VAE back into `model`, then drop the stored copy.
+
+    The weights are restored only when a snapshot exists and belongs to this
+    model's checkpoint; in that case `loaded_vae_file` is reset to None.
+    `delete_base_vae()` is called in every case.
+    """
-    global base_vae, checkpoint_info
+    global loaded_vae_file
    if base_vae is not None and checkpoint_info == model.sd_checkpoint_info:
-        load_vae_dict(model, base_vae)
+        print("Restoring base VAE")
+        _load_vae_dict(model, base_vae)
+        loaded_vae_file = None
    delete_base_vae()


@ -83,47 +89,54 @@ def refresh_vae_list(vae_path=vae_path, model_path=model_path):
    return vae_list


-def resolve_vae(checkpoint_file, vae_file="auto"):
-    global first_load, vae_dict, vae_list
-
-    # if vae_file argument is provided, it takes priority, but not saved
-    if vae_file and vae_file not in default_vae_list:
-        if not os.path.isfile(vae_file):
-            vae_file = "auto"
-            print("VAE provided as function argument doesn't exist")
-    # for the first load, if vae-path is provided, it takes priority, saved, and failure is reported
-    if first_load and shared.cmd_opts.vae_path is not None:
-        if os.path.isfile(shared.cmd_opts.vae_path):
-            vae_file = shared.cmd_opts.vae_path
-            shared.opts.data['sd_vae'] = get_filename(vae_file)
-        else:
-            print("VAE provided as command line argument doesn't exist")
-    # else, we load from settings
+def get_vae_from_settings(vae_file="auto"):
+    """Resolve `vae_file` from the saved `sd_vae` setting when it is still "auto".
+
+    Any value other than "auto" is returned unchanged. Otherwise the saved
+    setting is looked up in `vae_dict`; an unrecognised setting, or a selected
+    file that does not exist on disk, falls back to "auto".
+    """
+    # else, we load from settings, if not set to be default
    if vae_file == "auto" and shared.opts.sd_vae is not None:
        # if saved VAE settings isn't recognized, fallback to auto
        vae_file = vae_dict.get(shared.opts.sd_vae, "auto")
        # if VAE selected but not found, fallback to auto
        if vae_file not in default_vae_values and not os.path.isfile(vae_file):
-            vae_file = "auto"
-            print("Selected VAE doesn't exist")
+            # Report the missing path before the fallback overwrites it;
+            # printing afterwards would always show "auto".
+            print(f"Selected VAE doesn't exist: {vae_file}")
+            vae_file = "auto"
+    return vae_file
+
+
+def resolve_vae(checkpoint_file=None, vae_file="auto"):
+    """Decide which VAE file to use for a checkpoint.
+
+    Sources are consulted in this order: the `vae_file` argument, the
+    `--vae-path` command line option on first load (which is also saved to
+    settings), the saved settings selector (unless `sd_vae_as_default` is set),
+    `--vae-path` again for a still-"auto" result, and finally a `.vae.pt` or
+    `.vae.ckpt` file next to the checkpoint.
+
+    NOTE(review): `checkpoint_file` defaults to None but is passed to
+    `os.path.splitext` further down, which raises TypeError for None.
+    Presumably every caller supplies a path - confirm.
+    """
+    global first_load, vae_dict, vae_list
+
+    # if vae_file argument is provided, it takes priority, but not saved
+    if vae_file and vae_file not in default_vae_list:
+        if not os.path.isfile(vae_file):
+            print(f"VAE provided as function argument doesn't exist: {vae_file}")
+            vae_file = "auto"
+    # for the first load, if vae-path is provided, it takes priority, saved, and failure is reported
+    if first_load and shared.cmd_opts.vae_path is not None:
+        if os.path.isfile(shared.cmd_opts.vae_path):
+            vae_file = shared.cmd_opts.vae_path
+            shared.opts.data['sd_vae'] = get_filename(vae_file)
+        else:
+            # Report the command line path itself; vae_file still holds the
+            # function argument here, not the path that failed.
+            print(f"VAE provided as command line argument doesn't exist: {shared.cmd_opts.vae_path}")
+    # fallback to selector in settings, if vae selector not set to act as default fallback
+    if not shared.opts.sd_vae_as_default:
+        vae_file = get_vae_from_settings(vae_file)
    # vae-path cmd arg takes priority for auto
    if vae_file == "auto" and shared.cmd_opts.vae_path is not None:
        if os.path.isfile(shared.cmd_opts.vae_path):
            vae_file = shared.cmd_opts.vae_path
-            print("Using VAE provided as command line argument")
+            print(f"Using VAE provided as command line argument: {vae_file}")
    # if still not found, try look for ".vae.pt" beside model
    model_path = os.path.splitext(checkpoint_file)[0]
    if vae_file == "auto":
        vae_file_try = model_path + ".vae.pt"
        if os.path.isfile(vae_file_try):
            vae_file = vae_file_try
-            print("Using VAE found beside selected model")
+            print(f"Using VAE found similar to selected model: {vae_file}")
    # if still not found, try look for ".vae.ckpt" beside model
    if vae_file == "auto":
        vae_file_try = model_path + ".vae.ckpt"
        if os.path.isfile(vae_file_try):
            vae_file = vae_file_try
-            print("Using VAE found beside selected model")
+            print(f"Using VAE found similar to selected model: {vae_file}")
    # No more fallbacks for auto
    if vae_file == "auto":
        vae_file = None
@ -138,11 +151,30 @@ def load_vae(model, vae_file=None):
    global first_load, vae_dict, vae_list, loaded_vae_file
    # save_settings = False

+    cache_enabled = shared.opts.sd_vae_checkpoint_cache > 0
+
    if vae_file:
-        print(f"Loading VAE weights from: {vae_file}")
-        vae_ckpt = torch.load(vae_file, map_location=shared.weight_load_location)
-        vae_dict_1 = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss" and k not in vae_ignore_keys}
-        load_vae_dict(model, vae_dict_1)
+        if cache_enabled and vae_file in checkpoints_loaded:
+            # use vae checkpoint cache
+            print(f"Loading VAE weights [{get_filename(vae_file)}] from cache")
+            store_base_vae(model)
+            _load_vae_dict(model, checkpoints_loaded[vae_file])
+            # Refresh recency on a hit. Without this the OrderedDict stays in
+            # insertion order and the eviction below is FIFO, not LRU.
+            checkpoints_loaded.move_to_end(vae_file)
+        else:
+            assert os.path.isfile(vae_file), f"VAE file doesn't exist: {vae_file}"
+            print(f"Loading VAE weights from: {vae_file}")
+            store_base_vae(model)
+            vae_ckpt = torch.load(vae_file, map_location=shared.weight_load_location)
+            # Drop loss-related entries and the ignored EMA bookkeeping keys before loading.
+            vae_dict_1 = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss" and k not in vae_ignore_keys}
+            _load_vae_dict(model, vae_dict_1)
+
+            if cache_enabled:
+                # cache newly loaded vae
+                checkpoints_loaded[vae_file] = vae_dict_1.copy()
+
+        # clean up cache if limit is reached
+        if cache_enabled:
+            while len(checkpoints_loaded) > shared.opts.sd_vae_checkpoint_cache + 1: # we need to count the current model
+                checkpoints_loaded.popitem(last=False)  # LRU

        # If vae used is not in dict, update it
        # It will be removed on refresh though
@ -150,30 +182,22 @@ def load_vae(model, vae_file=None):
        if vae_opt not in vae_dict:
            vae_dict[vae_opt] = vae_file
            vae_list.append(vae_opt)
+    elif loaded_vae_file:
+        restore_base_vae(model)

    loaded_vae_file = vae_file

-    """
-    # Save current VAE to VAE settings, maybe? will it work?
-    if save_settings:
-        if vae_file is None:
-            vae_opt = "None"
-
-        # shared.opts.sd_vae = vae_opt
-    """
-
    first_load = False


 # don't call this from outside
-def load_vae_dict(model, vae_dict_1=None):
-    if vae_dict_1:
-        store_base_vae(model)
-        model.first_stage_model.load_state_dict(vae_dict_1)
-    else:
-        restore_base_vae()
+def _load_vae_dict(model, vae_dict_1):
+    """Load the state dict `vae_dict_1` into the model's first-stage model and cast it to `devices.dtype_vae`."""
+    model.first_stage_model.load_state_dict(vae_dict_1)
    model.first_stage_model.to(devices.dtype_vae)

+def clear_loaded_vae():
+    """Reset the module-level `loaded_vae_file` marker to None; the model's weights are not touched."""
+    global loaded_vae_file
+    loaded_vae_file = None

 def reload_vae_weights(sd_model=None, vae_file="auto"):
    from modules import lowvram, devices, sd_hijack
@ -203,5 +227,5 @@ def reload_vae_weights(sd_model=None, vae_file="auto"):
    if not shared.cmd_opts.lowvram and not shared.cmd_opts.medvram:
        sd_model.to(devices.device)

-    print(f"VAE Weights loaded.")
+    print("VAE Weights loaded.")
    return sd_model
--- a/modules/sd_vae_approx.py
+++ b/modules/sd_vae_approx.py
@ -0,0 +1,58 @@
+import os
+
+import torch
+from torch import nn
+from modules import devices, paths
+
+sd_vae_approx_model = None
+
+
+class VAEApprox(nn.Module):
+    """Stack of eight unpadded convolutions mapping 4 input channels to 3 output channels.
+
+    The layers are registered as `conv1` .. `conv8`, so state dict keys are
+    `conv<N>.weight` / `conv<N>.bias`.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # (in_channels, out_channels, kernel_size) for conv1 .. conv8, in order.
+        layer_specs = [
+            (4, 8, 7),
+            (8, 16, 5),
+            (16, 32, 3),
+            (32, 64, 3),
+            (64, 32, 3),
+            (32, 16, 3),
+            (16, 8, 3),
+            (8, 3, 3),
+        ]
+        for number, (c_in, c_out, kernel) in enumerate(layer_specs, start=1):
+            setattr(self, f"conv{number}", nn.Conv2d(c_in, c_out, (kernel, kernel)))
+
+    def forward(self, x):
+        """Upscale `x` by 2 in both spatial dimensions, then apply every conv followed by leaky ReLU."""
+        # The unpadded kernels (7, 5, then six 3s) trim 6 + 4 + 12 = 22 pixels
+        # per dimension; padding 11 on each side keeps the upscaled size.
+        extra = 11
+        height, width = x.shape[2], x.shape[3]
+        x = nn.functional.interpolate(x, (height * 2, width * 2))
+        x = nn.functional.pad(x, (extra, extra, extra, extra))
+
+        for number in range(1, 9):
+            conv = getattr(self, f"conv{number}")
+            x = nn.functional.leaky_relu(conv(x), 0.1)
+
+        return x
+
+
+def model():
+    """Return the shared VAEApprox network, creating and loading it on first use.
+
+    Weights are read from `<models_path>/VAE-approx/model.pt`. The module-level
+    cache is only filled once loading has fully succeeded, so a failed load is
+    retried on the next call instead of leaving an untrained network cached.
+    """
+    global sd_vae_approx_model
+
+    if sd_vae_approx_model is None:
+        loaded_model = VAEApprox()
+        weights_path = os.path.join(paths.models_path, "VAE-approx", "model.pt")
+        # Deserialize onto the CPU so weights saved from a GPU still load on a
+        # machine without one; the network is moved to the target device below.
+        loaded_model.load_state_dict(torch.load(weights_path, map_location='cpu'))
+        loaded_model.eval()
+        loaded_model.to(devices.device, devices.dtype)
+        sd_vae_approx_model = loaded_model
+
+    return sd_vae_approx_model
+
+
+def cheap_approximation(sample):
+    """Project a 4-channel latent `sample` to 3 channels with a fixed linear map.
+
+    Each output channel is a weighted sum of the four latent channels; the
+    spatial dimensions are left unchanged.
+    """
+    # https://discuss.huggingface.co/t/decoding-latents-to-rgb-without-upscaling/23204/2
+
+    # One row per latent channel, one column per output channel.
+    latent_to_rgb = torch.tensor([
+        [0.298, 0.207, 0.208],
+        [0.187, 0.286, 0.173],
+        [-0.158, 0.189, 0.264],
+        [-0.184, -0.271, -0.473],
+    ]).to(sample.device)
+
+    return torch.einsum("lxy,lr -> rxy", sample, latent_to_rgb)
--- a/modules/shared.py
+++ b/modules/shared.py
@ -3,26 +3,27 @@ import datetime
 import json
 import os
 import sys
-from collections import OrderedDict
 import time

+from PIL import Image
 import gradio as gr
 import tqdm

 import modules.artists
 import modules.interrogate
 import modules.memmon
-import modules.sd_models
 import modules.styles
 import modules.devices as devices
-from modules import sd_samplers, sd_models, localization, sd_vae
-from modules.hypernetworks import hypernetwork
+from modules import localization, sd_vae, extensions, script_loading, errors
 from modules.paths import models_path, script_path, sd_path

+
+demo = None
+
 sd_model_file = os.path.join(script_path, 'model.ckpt')
 default_sd_model_file = sd_model_file
 parser = argparse.ArgumentParser()
-parser.add_argument("--config", type=str, default=os.path.join(sd_path, "configs/stable-diffusion/v1-inference.yaml"), help="path to config which constructs model",)
+parser.add_argument("--config", type=str, default=os.path.join(script_path, "configs/v1-inference.yaml"), help="path to config which constructs model",)
 parser.add_argument("--ckpt", type=str, default=sd_model_file, help="path to checkpoint of stable diffusion model; if specified, this checkpoint will be added to the list of checkpoints and loaded",)
 parser.add_argument("--ckpt-dir", type=str, default=None, help="Path to directory with stable diffusion checkpoints")
 parser.add_argument("--gfpgan-dir", type=str, help="GFPGAN directory", default=('./src/gfpgan' if os.path.exists('./src/gfpgan') else './GFPGAN'))
@ -50,18 +51,15 @@ parser.add_argument("--gfpgan-models-path", type=str, help="Path to directory wi
 parser.add_argument("--esrgan-models-path", type=str, help="Path to directory with ESRGAN model file(s).", default=os.path.join(models_path, 'ESRGAN'))
 parser.add_argument("--bsrgan-models-path", type=str, help="Path to directory with BSRGAN model file(s).", default=os.path.join(models_path, 'BSRGAN'))
 parser.add_argument("--realesrgan-models-path", type=str, help="Path to directory with RealESRGAN model file(s).", default=os.path.join(models_path, 'RealESRGAN'))
-parser.add_argument("--scunet-models-path", type=str, help="Path to directory with ScuNET model file(s).", default=os.path.join(models_path, 'ScuNET'))
-parser.add_argument("--swinir-models-path", type=str, help="Path to directory with SwinIR model file(s).", default=os.path.join(models_path, 'SwinIR'))
-parser.add_argument("--ldsr-models-path", type=str, help="Path to directory with LDSR model file(s).", default=os.path.join(models_path, 'LDSR'))
 parser.add_argument("--clip-models-path", type=str, help="Path to directory with CLIP model file(s).", default=None)
 parser.add_argument("--xformers", action='store_true', help="enable xformers for cross attention layers")
 parser.add_argument("--force-enable-xformers", action='store_true', help="enable xformers for cross attention layers regardless of whether the checking code thinks you can run it; do not make bug reports if this fails to work")
-parser.add_argument("--deepdanbooru", action='store_true', help="enable deepdanbooru interrogator")
+parser.add_argument("--deepdanbooru", action='store_true', help="does not do anything")
 parser.add_argument("--opt-split-attention", action='store_true', help="force-enables Doggettx's cross-attention layer optimization. By default, it's on for torch cuda.")
 parser.add_argument("--opt-split-attention-invokeai", action='store_true', help="force-enables InvokeAI's cross-attention layer optimization. By default, it's on when cuda is unavailable.")
 parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find")
 parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization")
-parser.add_argument("--use-cpu", nargs='+',choices=['all', 'sd', 'interrogate', 'gfpgan', 'swinir', 'esrgan', 'scunet', 'codeformer'], help="use CPU as torch device for specified modules", default=[], type=str.lower)
+parser.add_argument("--use-cpu", nargs='+', help="use CPU as torch device for specified modules", default=[], type=str.lower)
 parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests")
 parser.add_argument("--port", type=int, help="launch gradio with given server port, you need root/admin rights for ports < 1024, defaults to 7860 if available", default=None)
 parser.add_argument("--show-negative-prompt", action='store_true', help="does not do anything", default=False)
@ -72,6 +70,7 @@ parser.add_argument("--ui-settings-file", type=str, help="filename to use for ui
 parser.add_argument("--gradio-debug",  action='store_true', help="launch gradio with --debug option")
 parser.add_argument("--gradio-auth", type=str, help='set gradio authentication like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3"', default=None)
 parser.add_argument("--gradio-img2img-tool", type=str, help='gradio image uploader tool: can be either editor for ctopping, or color-sketch for drawing', choices=["color-sketch", "editor"], default="editor")
+parser.add_argument("--gradio-inpaint-tool", type=str, choices=["sketch", "color-sketch"], default="sketch", help="gradio inpainting editor: can be either sketch to only blur/noise the input, or color-sketch to paint over it")
 parser.add_argument("--opt-channelslast", action='store_true', help="change memory type for stable diffusion to channels last")
 parser.add_argument("--styles-file", type=str, help="filename to use for styles", default=os.path.join(script_path, 'styles.csv'))
 parser.add_argument("--autolaunch", action='store_true', help="open the webui URL in the system's default browser upon launch", default=False)
@ -81,17 +80,24 @@ parser.add_argument("--disable-console-progressbars", action='store_true', help=
 parser.add_argument("--enable-console-prompts", action='store_true', help="print prompts to console when generating with txt2img and img2img", default=False)
 parser.add_argument('--vae-path', type=str, help='Path to Variational Autoencoders model', default=None)
 parser.add_argument("--disable-safe-unpickle", action='store_true', help="disable checking pytorch models for malicious code", default=False)
-parser.add_argument("--api", action='store_true', help="use api=True to launch the api with the webui")
-parser.add_argument("--nowebui", action='store_true', help="use api=True to launch the api instead of the webui")
+parser.add_argument("--api", action='store_true', help="use api=True to launch the API together with the webui (use --nowebui instead for only the API)")
+parser.add_argument("--api-auth", type=str, help='Set authentication for API like "username:password"; or comma-delimit multiple like "u1:p1,u2:p2,u3:p3"', default=None)
+parser.add_argument("--api-log", action='store_true', help="use api-log=True to enable logging of all API requests")
+parser.add_argument("--nowebui", action='store_true', help="use api=True to launch the API instead of the webui")
 parser.add_argument("--ui-debug-mode", action='store_true', help="Don't load model to quickly launch UI")
 parser.add_argument("--device-id", type=str, help="Select the default CUDA device to use (export CUDA_VISIBLE_DEVICES=0,1,etc might be needed before)", default=None)
 parser.add_argument("--administrator", action='store_true', help="Administrator rights", default=False)
-parser.add_argument("--cors-allow-origins", type=str, help="Allowed CORS origins", default=None)
+parser.add_argument("--cors-allow-origins", type=str, help="Allowed CORS origin(s) in the form of a comma-separated list (no spaces)", default=None)
+parser.add_argument("--cors-allow-origins-regex", type=str, help="Allowed CORS origin(s) in the form of a single regular expression", default=None)
 parser.add_argument("--tls-keyfile", type=str, help="Partially enables TLS, requires --tls-certfile to fully function", default=None)
 parser.add_argument("--tls-certfile", type=str, help="Partially enables TLS, requires --tls-keyfile to fully function", default=None)
 parser.add_argument("--server-name", type=str, help="Sets hostname of server", default=None)

+script_loading.preload_extensions(extensions.extensions_dir, parser)
+script_loading.preload_extensions(extensions.extensions_builtin_dir, parser)
+
 cmd_opts = parser.parse_args()
+
 restricted_opts = {
    "samples_filename_pattern",
    "directories_filename_pattern",
@ -104,10 +110,21 @@ restricted_opts = {
    "outdir_save",
 }

-cmd_opts.disable_extension_access = (cmd_opts.share or cmd_opts.listen) and not cmd_opts.enable_insecure_extension_access
+ui_reorder_categories = [
+    "sampler",
+    "dimensions",
+    "cfg",
+    "seed",
+    "checkboxes",
+    "hires_fix",
+    "batch",
+    "scripts",
+]

-devices.device, devices.device_interrogate, devices.device_gfpgan, devices.device_swinir, devices.device_esrgan, devices.device_scunet, devices.device_codeformer = \
-(devices.cpu if any(y in cmd_opts.use_cpu for y in [x, 'all']) else devices.get_optimal_device() for x in ['sd', 'interrogate', 'gfpgan', 'swinir', 'esrgan', 'scunet', 'codeformer'])
+cmd_opts.disable_extension_access = (cmd_opts.share or cmd_opts.listen or cmd_opts.server_name) and not cmd_opts.enable_insecure_extension_access
+
+devices.device, devices.device_interrogate, devices.device_gfpgan, devices.device_esrgan, devices.device_codeformer = \
+    (devices.cpu if any(y in cmd_opts.use_cpu for y in [x, 'all']) else devices.get_optimal_device() for x in ['sd', 'interrogate', 'gfpgan', 'esrgan', 'codeformer'])

 device = devices.device
 weight_load_location = None if cmd_opts.lowram else "cpu"
@ -118,10 +135,12 @@ xformers_available = False
 config_filename = cmd_opts.ui_settings_file

 os.makedirs(cmd_opts.hypernetwork_dir, exist_ok=True)
-hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
+hypernetworks = {}
 loaded_hypernetwork = None

+
 def reload_hypernetworks():
+    from modules.hypernetworks import hypernetwork
    global hypernetworks

    hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
@ -161,9 +180,10 @@ class State:
    def dict(self):
        obj = {
            "skipped": self.skipped,
-            "interrupted": self.skipped,
+            "interrupted": self.interrupted,
            "job": self.job,
            "job_count": self.job_count,
+            "job_timestamp": self.job_timestamp,
            "job_no": self.job_no,
            "sampling_step": self.sampling_step,
            "sampling_steps": self.sampling_steps,
@ -194,22 +214,25 @@ class State:

    """sets self.current_image from self.current_latent if enough sampling steps have been made after the last call to this"""
    def set_current_image(self):
+        if not parallel_processing_allowed:
+            return
+
        if self.sampling_step - self.current_image_sampling_step >= opts.show_progress_every_n_steps and opts.show_progress_every_n_steps > 0:
            self.do_set_current_image()

    def do_set_current_image(self):
-        if not parallel_processing_allowed:
-            return
        if self.current_latent is None:
            return

+        import modules.sd_samplers
        if opts.show_progress_grid:
-            self.current_image = sd_samplers.samples_to_image_grid(self.current_latent)
+            self.current_image = modules.sd_samplers.samples_to_image_grid(self.current_latent)
        else:
-            self.current_image = sd_samplers.sample_to_image(self.current_latent)
+            self.current_image = modules.sd_samplers.sample_to_image(self.current_latent)

        self.current_image_sampling_step = self.sampling_step

+
 state = State()

 artist_db = modules.artists.ArtistsDatabase(os.path.join(script_path, 'artists.csv'))
@ -245,6 +268,21 @@ def options_section(section_identifier, options_dict):
    return options_dict


+def list_checkpoint_tiles():
+    import modules.sd_models
+    return modules.sd_models.checkpoint_tiles()
+
+
+def refresh_checkpoints():
+    import modules.sd_models
+    return modules.sd_models.list_models()
+
+
+def list_samplers():
+    import modules.sd_samplers
+    return modules.sd_samplers.all_samplers
+
+
 hide_dirs = {"visible": not cmd_opts.hide_ui_dir_config}

 options_templates = {}
@ -271,8 +309,13 @@ options_templates.update(options_section(('saving-images', "Saving images/grids"
    "export_for_4chan": OptionInfo(True, "If PNG image is larger than 4MB or any dimension is larger than 4000, downscale and save copy as JPG"),

    "use_original_name_batch": OptionInfo(False, "Use original name for output filename during batch process in extras tab"),
+    "use_upscaler_name_as_suffix": OptionInfo(False, "Use upscaler name as filename suffix in the extras tab"),
    "save_selected_only": OptionInfo(True, "When using 'Save' button, only save a single selected image"),
    "do_not_add_watermark": OptionInfo(False, "Do not add watermark to images"),
+
+    "temp_dir":  OptionInfo("", "Directory for temporary images; leave empty for default"),
+    "clean_temp_dir_at_start": OptionInfo(False, "Cleanup non-default temporary directory when starting webui"),
+
 }))

 options_templates.update(options_section(('saving-paths', "Paths for saving"), {
@ -297,12 +340,8 @@ options_templates.update(options_section(('saving-to-dirs', "Saving to a directo
 options_templates.update(options_section(('upscaling', "Upscaling"), {
    "ESRGAN_tile": OptionInfo(192, "Tile size for ESRGAN upscalers. 0 = no tiling.", gr.Slider, {"minimum": 0, "maximum": 512, "step": 16}),
    "ESRGAN_tile_overlap": OptionInfo(8, "Tile overlap, in pixels for ESRGAN upscalers. Low values = visible seam.", gr.Slider, {"minimum": 0, "maximum": 48, "step": 1}),
-    "realesrgan_enabled_models": OptionInfo(["R-ESRGAN x4+", "R-ESRGAN x4+ Anime6B"], "Select which Real-ESRGAN models to show in the web UI. (Requires restart)", gr.CheckboxGroup, lambda: {"choices": realesrgan_models_names()}),
-    "SWIN_tile": OptionInfo(192, "Tile size for all SwinIR.", gr.Slider, {"minimum": 16, "maximum": 512, "step": 16}),
-    "SWIN_tile_overlap": OptionInfo(8, "Tile overlap, in pixels for SwinIR. Low values = visible seam.", gr.Slider, {"minimum": 0, "maximum": 48, "step": 1}),
-    "ldsr_steps": OptionInfo(100, "LDSR processing steps. Lower = faster", gr.Slider, {"minimum": 1, "maximum": 200, "step": 1}),
+    "realesrgan_enabled_models": OptionInfo(["R-ESRGAN 4x+", "R-ESRGAN 4x+ Anime6B"], "Select which Real-ESRGAN models to show in the web UI. (Requires restart)", gr.CheckboxGroup, lambda: {"choices": realesrgan_models_names()}),
    "upscaler_for_img2img": OptionInfo(None, "Upscaler for img2img", gr.Dropdown, lambda: {"choices": [x.name for x in sd_upscalers]}),
-    "use_scale_latent_for_hires_fix": OptionInfo(False, "Upscale latent space image when doing hires. fix"),
 }))

 options_templates.update(options_section(('face-restoration', "Face restoration"), {
@ -319,7 +358,8 @@ options_templates.update(options_section(('system', "System"), {

 options_templates.update(options_section(('training', "Training"), {
    "unload_models_when_training": OptionInfo(False, "Move VAE and CLIP to RAM when training if possible. Saves VRAM."),
-    "save_optimizer_state": OptionInfo(False, "Saves Optimizer state as separate *.optim file. Training can be resumed with HN itself and matching optim file."),
+    "pin_memory": OptionInfo(False, "Turn on pin_memory for DataLoader. Makes training slightly faster but can increase memory usage."),
+    "save_optimizer_state": OptionInfo(False, "Saves Optimizer state as separate *.optim file. Training of embedding or HN can be resumed with the matching optim file."),
    "dataset_filename_word_regex": OptionInfo("", "Filename word regex"),
    "dataset_filename_join_string": OptionInfo(" ", "Filename join string"),
    "training_image_repeats_per_epoch": OptionInfo(1, "Number of repeats for a single input image per epoch; used only for displaying epoch number", gr.Number, {"precision": 0}),
@ -328,24 +368,31 @@ options_templates.update(options_section(('training', "Training"), {
 }))

 options_templates.update(options_section(('sd', "Stable Diffusion"), {
-    "sd_model_checkpoint": OptionInfo(None, "Stable Diffusion checkpoint", gr.Dropdown, lambda: {"choices": modules.sd_models.checkpoint_tiles()}, refresh=sd_models.list_models),
+    "sd_model_checkpoint": OptionInfo(None, "Stable Diffusion checkpoint", gr.Dropdown, lambda: {"choices": list_checkpoint_tiles()}, refresh=refresh_checkpoints),
    "sd_checkpoint_cache": OptionInfo(0, "Checkpoints to cache in RAM", gr.Slider, {"minimum": 0, "maximum": 10, "step": 1}),
-    "sd_vae": OptionInfo("auto", "SD VAE", gr.Dropdown, lambda: {"choices": list(sd_vae.vae_list)}, refresh=sd_vae.refresh_vae_list),
+    "sd_vae_checkpoint_cache": OptionInfo(0, "VAE Checkpoints to cache in RAM", gr.Slider, {"minimum": 0, "maximum": 10, "step": 1}),
+    "sd_vae": OptionInfo("auto", "SD VAE", gr.Dropdown, lambda: {"choices": sd_vae.vae_list}, refresh=sd_vae.refresh_vae_list),
+    "sd_vae_as_default": OptionInfo(False, "Ignore selected VAE for stable diffusion checkpoints that have their own .vae.pt next to them"),
    "sd_hypernetwork": OptionInfo("None", "Hypernetwork", gr.Dropdown, lambda: {"choices": ["None"] + [x for x in hypernetworks.keys()]}, refresh=reload_hypernetworks),
    "sd_hypernetwork_strength": OptionInfo(1.0, "Hypernetwork strength", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.001}),
    "inpainting_mask_weight": OptionInfo(1.0, "Inpainting conditioning mask strength", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
+    "initial_noise_multiplier": OptionInfo(1.0, "Noise multiplier for img2img", gr.Slider, {"minimum": 0.5, "maximum": 1.5, "step": 0.01 }),
    "img2img_color_correction": OptionInfo(False, "Apply color correction to img2img results to match original colors."),
    "img2img_fix_steps": OptionInfo(False, "With img2img, do exactly the amount of steps the slider specifies (normally you'd do less with less denoising)."),
+    "img2img_background_color": OptionInfo("#ffffff", "With img2img, fill image's transparent parts with this color.", gr.ColorPicker, {}),
    "enable_quantization": OptionInfo(False, "Enable quantization in K samplers for sharper and cleaner results. This may change existing seeds. Requires restart to apply."),
    "enable_emphasis": OptionInfo(True, "Emphasis: use (text) to make model pay more attention to text and [text] to make it pay less attention"),
-    "use_old_emphasis_implementation": OptionInfo(False, "Use old emphasis implementation. Can be useful to reproduce old seeds."),
    "enable_batch_seeds": OptionInfo(True, "Make K-diffusion samplers produce same images in a batch as when making a single image"),
    "comma_padding_backtrack": OptionInfo(20, "Increase coherency by padding from the last comma within n tokens when using more than 75 tokens", gr.Slider, {"minimum": 0, "maximum": 74, "step": 1 }),
-    "filter_nsfw": OptionInfo(False, "Filter NSFW content"),
-    'CLIP_stop_at_last_layers': OptionInfo(1, "Stop At last layers of CLIP model", gr.Slider, {"minimum": 1, "maximum": 12, "step": 1}),
+    'CLIP_stop_at_last_layers': OptionInfo(1, "Clip skip", gr.Slider, {"minimum": 1, "maximum": 12, "step": 1}),
    "random_artist_categories": OptionInfo([], "Allowed categories for random artists selection when using the Roll button", gr.CheckboxGroup, {"choices": artist_db.categories()}),
 }))

+options_templates.update(options_section(('compatibility', "Compatibility"), {
+    "use_old_emphasis_implementation": OptionInfo(False, "Use old emphasis implementation. Can be useful to reproduce old seeds."),
+    "use_old_karras_scheduler_sigmas": OptionInfo(False, "Use old karras scheduler sigmas (0.1 to 10)."),
+}))
+
 options_templates.update(options_section(('interrogate', "Interrogate Options"), {
    "interrogate_keep_models_in_memory": OptionInfo(False, "Interrogate: keep models in VRAM"),
    "interrogate_use_builtin_artists": OptionInfo(True, "Interrogate: use artists from artists.csv"),
@ -358,11 +405,13 @@ options_templates.update(options_section(('interrogate', "Interrogate Options"),
    "deepbooru_sort_alpha": OptionInfo(True, "Interrogate: deepbooru sort alphabetically"),
    "deepbooru_use_spaces": OptionInfo(False, "use spaces for tags in deepbooru"),
    "deepbooru_escape": OptionInfo(True, "escape (\\) brackets in deepbooru (so they are used as literal brackets and not for emphasis)"),
+    "deepbooru_filter_tags": OptionInfo("", "filter out those tags from deepbooru output (separated by comma)"),
 }))

 options_templates.update(options_section(('ui', "User interface"), {
    "show_progressbar": OptionInfo(True, "Show progressbar"),
    "show_progress_every_n_steps": OptionInfo(0, "Show image creation progress every N sampling steps. Set to 0 to disable. Set to -1 to show after completion of batch.", gr.Slider, {"minimum": -1, "maximum": 32, "step": 1}),
+    "show_progress_type": OptionInfo("Full", "Image creation progress preview mode", gr.Radio, {"choices": ["Full", "Approx NN", "Approx cheap"]}),
    "show_progress_grid": OptionInfo(True, "Show previews of all images generated in a batch as a grid"),
    "return_grid": OptionInfo(True, "Show grid in results for web"),
    "do_not_show_images": OptionInfo(False, "Do not show any images in results for web"),
@ -370,16 +419,20 @@ options_templates.update(options_section(('ui', "User interface"), {
    "add_model_name_to_info": OptionInfo(False, "Add model name to generation information"),
    "disable_weights_auto_swap": OptionInfo(False, "When reading generation parameters from text into UI (from PNG info or pasted text), do not change the selected model/checkpoint."),
    "send_seed": OptionInfo(True, "Send seed when sending prompt or image to other interface"),
+    "send_size": OptionInfo(True, "Send size when sending prompt or image to another interface"),
    "font": OptionInfo("", "Font for image grids that have text"),
    "js_modal_lightbox": OptionInfo(True, "Enable full page image viewer"),
    "js_modal_lightbox_initially_zoomed": OptionInfo(True, "Show images zoomed in by default in full page image viewer"),
    "show_progress_in_title": OptionInfo(True, "Show generation progress in window title."),
+    "samplers_in_dropdown": OptionInfo(True, "Use dropdown for sampler selection instead of radio group"),
+    "dimensions_and_batch_together": OptionInfo(True, "Show Witdth/Height and Batch sliders in same row"),
    'quicksettings': OptionInfo("sd_model_checkpoint", "Quicksettings list"),
+    'ui_reorder': OptionInfo(", ".join(ui_reorder_categories), "txt2img/ing2img UI item order"),
    'localization': OptionInfo("None", "Localization (requires restart)", gr.Dropdown, lambda: {"choices": ["None"] + list(localization.localizations.keys())}, refresh=lambda: localization.list_localizations(cmd_opts.localizations_dir)),
 }))

 options_templates.update(options_section(('sampler-params', "Sampler parameters"), {
-    "hide_samplers": OptionInfo([], "Hide samplers in user interface (requires restart)", gr.CheckboxGroup, lambda: {"choices": [x.name for x in sd_samplers.all_samplers]}),
+    "hide_samplers": OptionInfo([], "Hide samplers in user interface (requires restart)", gr.CheckboxGroup, lambda: {"choices": [x.name for x in list_samplers()]}),
    "eta_ddim": OptionInfo(0.0, "eta (noise multiplier) for DDIM", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
    "eta_ancestral": OptionInfo(1.0, "eta (noise multiplier) for ancestral samplers", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
    "ddim_discretize": OptionInfo('uniform', "img2img DDIM discretize", gr.Radio, {"choices": ['uniform', 'quad']}),
@ -432,6 +485,28 @@ class Options:

        return super(Options, self).__getattribute__(item)

+    def set(self, key, value):
+        """sets an option and calls its onchange callback, returning True if the option changed and False otherwise"""
+
+        oldval = self.data.get(key, None)
+        if oldval == value:
+            return False
+
+        try:
+            setattr(self, key, value)
+        except RuntimeError:
+            return False
+
+        if self.data_labels[key].onchange is not None:
+            try:
+                self.data_labels[key].onchange()
+            except Exception as e:
+                errors.display(e, f"changing setting {key} to {value}")
+                setattr(self, key, oldval)
+                return False
+
+        return True
+
    def save(self, filename):
        assert not cmd_opts.freeze_settings, "saving settings is disabled"

@ -491,6 +566,15 @@ opts = Options()
 if os.path.exists(config_filename):
    opts.load(config_filename)

+latent_upscale_default_mode = "Latent"
+latent_upscale_modes = {
+    "Latent": {"mode": "bilinear", "antialias": False},
+    "Latent (antialiased)": {"mode": "bilinear", "antialias": True},
+    "Latent (bicubic)": {"mode": "bicubic", "antialias": False},
+    "Latent (bicubic antialiased)": {"mode": "bicubic", "antialias": True},
+    "Latent (nearest)": {"mode": "nearest", "antialias": False},
+}
+
 sd_upscalers = []

 sd_model = None
--- a/modules/styles.py
+++ b/modules/styles.py
@ -65,17 +65,6 @@ class StyleDatabase:
    def apply_negative_styles_to_prompt(self, prompt, styles):
        return apply_styles_to_prompt(prompt, [self.styles.get(x, self.no_style).negative_prompt for x in styles])

-    def apply_styles(self, p: StableDiffusionProcessing) -> None:
-        if isinstance(p.prompt, list):
-            p.prompt = [self.apply_styles_to_prompt(prompt, p.styles) for prompt in p.prompt]
-        else:
-            p.prompt = self.apply_styles_to_prompt(p.prompt, p.styles)
-
-        if isinstance(p.negative_prompt, list):
-            p.negative_prompt = [self.apply_negative_styles_to_prompt(prompt, p.styles) for prompt in p.negative_prompt]
-        else:
-            p.negative_prompt = self.apply_negative_styles_to_prompt(p.negative_prompt, p.styles)
-
    def save_styles(self, path: str) -> None:
        # Write to temporary file first, so we don't nuke the file if something goes wrong
        fd, temp_path = tempfile.mkstemp(".csv")
--- a/modules/swinir_model.py
+++ b/modules/swinir_model.py
@ -1,161 +0,0 @@
-import contextlib
-import os
-
-import numpy as np
-import torch
-from PIL import Image
-from basicsr.utils.download_util import load_file_from_url
-from tqdm import tqdm
-
-from modules import modelloader, devices
-from modules.shared import cmd_opts, opts
-from modules.swinir_model_arch import SwinIR as net
-from modules.swinir_model_arch_v2 import Swin2SR as net2
-from modules.upscaler import Upscaler, UpscalerData
-
-precision_scope = (
-    torch.autocast if cmd_opts.precision == "autocast" else contextlib.nullcontext
-)
-
-
-class UpscalerSwinIR(Upscaler):
-    def __init__(self, dirname):
-        self.name = "SwinIR"
-        self.model_url = "https://github.com/JingyunLiang/SwinIR/releases/download/v0.0" \
-                         "/003_realSR_BSRGAN_DFOWMFC_s64w8_SwinIR" \
-                         "-L_x4_GAN.pth "
-        self.model_name = "SwinIR 4x"
-        self.user_path = dirname
-        super().__init__()
-        scalers = []
-        model_files = self.find_models(ext_filter=[".pt", ".pth"])
-        for model in model_files:
-            if "http" in model:
-                name = self.model_name
-            else:
-                name = modelloader.friendly_name(model)
-            model_data = UpscalerData(name, model, self)
-            scalers.append(model_data)
-        self.scalers = scalers
-
-    def do_upscale(self, img, model_file):
-        model = self.load_model(model_file)
-        if model is None:
-            return img
-        model = model.to(devices.device_swinir)
-        img = upscale(img, model)
-        try:
-            torch.cuda.empty_cache()
-        except:
-            pass
-        return img
-
-    def load_model(self, path, scale=4):
-        if "http" in path:
-            dl_name = "%s%s" % (self.model_name.replace(" ", "_"), ".pth")
-            filename = load_file_from_url(url=path, model_dir=self.model_path, file_name=dl_name, progress=True)
-        else:
-            filename = path
-        if filename is None or not os.path.exists(filename):
-            return None
-        if filename.endswith(".v2.pth"):
-            model = net2(
-            upscale=scale,
-            in_chans=3,
-            img_size=64,
-            window_size=8,
-            img_range=1.0,
-            depths=[6, 6, 6, 6, 6, 6],
-            embed_dim=180,
-            num_heads=[6, 6, 6, 6, 6, 6],
-            mlp_ratio=2,
-            upsampler="nearest+conv",
-            resi_connection="1conv",
-            )
-            params = None
-        else:
-            model = net(
-                upscale=scale,
-                in_chans=3,
-                img_size=64,
-                window_size=8,
-                img_range=1.0,
-                depths=[6, 6, 6, 6, 6, 6, 6, 6, 6],
-                embed_dim=240,
-                num_heads=[8, 8, 8, 8, 8, 8, 8, 8, 8],
-                mlp_ratio=2,
-                upsampler="nearest+conv",
-                resi_connection="3conv",
-            )
-            params = "params_ema"
-
-        pretrained_model = torch.load(filename)
-        if params is not None:
-            model.load_state_dict(pretrained_model[params], strict=True)
-        else:
-            model.load_state_dict(pretrained_model, strict=True)
-        if not cmd_opts.no_half:
-            model = model.half()
-        return model
-
-
-def upscale(
-        img,
-        model,
-        tile=opts.SWIN_tile,
-        tile_overlap=opts.SWIN_tile_overlap,
-        window_size=8,
-        scale=4,
-):
-    img = np.array(img)
-    img = img[:, :, ::-1]
-    img = np.moveaxis(img, 2, 0) / 255
-    img = torch.from_numpy(img).float()
-    img = devices.mps_contiguous_to(img.unsqueeze(0), devices.device_swinir)
-    with torch.no_grad(), precision_scope("cuda"):
-        _, _, h_old, w_old = img.size()
-        h_pad = (h_old // window_size + 1) * window_size - h_old
-        w_pad = (w_old // window_size + 1) * window_size - w_old
-        img = torch.cat([img, torch.flip(img, [2])], 2)[:, :, : h_old + h_pad, :]
-        img = torch.cat([img, torch.flip(img, [3])], 3)[:, :, :, : w_old + w_pad]
-        output = inference(img, model, tile, tile_overlap, window_size, scale)
-        output = output[..., : h_old * scale, : w_old * scale]
-        output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
-        if output.ndim == 3:
-            output = np.transpose(
-                output[[2, 1, 0], :, :], (1, 2, 0)
-            )  # CHW-RGB to HCW-BGR
-        output = (output * 255.0).round().astype(np.uint8)  # float32 to uint8
-        return Image.fromarray(output, "RGB")
-
-
-def inference(img, model, tile, tile_overlap, window_size, scale):
-    # test the image tile by tile
-    b, c, h, w = img.size()
-    tile = min(tile, h, w)
-    assert tile % window_size == 0, "tile size should be a multiple of window_size"
-    sf = scale
-
-    stride = tile - tile_overlap
-    h_idx_list = list(range(0, h - tile, stride)) + [h - tile]
-    w_idx_list = list(range(0, w - tile, stride)) + [w - tile]
-    E = torch.zeros(b, c, h * sf, w * sf, dtype=torch.half, device=devices.device_swinir).type_as(img)
-    W = torch.zeros_like(E, dtype=torch.half, device=devices.device_swinir)
-
-    with tqdm(total=len(h_idx_list) * len(w_idx_list), desc="SwinIR tiles") as pbar:
-        for h_idx in h_idx_list:
-            for w_idx in w_idx_list:
-                in_patch = img[..., h_idx: h_idx + tile, w_idx: w_idx + tile]
-                out_patch = model(in_patch)
-                out_patch_mask = torch.ones_like(out_patch)
-
-                E[
-                ..., h_idx * sf: (h_idx + tile) * sf, w_idx * sf: (w_idx + tile) * sf
-                ].add_(out_patch)
-                W[
-                ..., h_idx * sf: (h_idx + tile) * sf, w_idx * sf: (w_idx + tile) * sf
-                ].add_(out_patch_mask)
-                pbar.update(1)
-    output = E.div_(W)
-
-    return output
--- a/modules/swinir_model_arch.py
+++ b/modules/swinir_model_arch.py
@ -1,867 +0,0 @@
-# -----------------------------------------------------------------------------------
-# SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
-# Originally Written by Ze Liu, Modified by Jingyun Liang.
-# -----------------------------------------------------------------------------------
-
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
-    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both of shifted and non-shifted window.
-
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        # define a parameter table of relative position bias
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-
-        self.proj_drop = nn.Dropout(proj_drop)
-
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-
-        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-
-class SwinTransformerBlock(nn.Module):
-    r""" Swin Transformer Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        if self.shift_size > 0:
-            attn_mask = self.calculate_mask(self.input_resolution)
-        else:
-            attn_mask = None
-
-        self.register_buffer("attn_mask", attn_mask)
-
-    def calculate_mask(self, x_size):
-        # calculate attention mask for SW-MSA
-        H, W = x_size
-        img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-    def forward(self, x, x_size):
-        H, W = x_size
-        B, L, C = x.shape
-        # assert L == H * W, "input feature has wrong size"
-
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(B, H, W, C)
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-        else:
-            shifted_x = x
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
-        if self.input_resolution == x_size:
-            attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
-        else:
-            attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-        x = x.view(B, H * W, C)
-
-        # FFN
-        x = shortcut + self.drop_path(x)
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-
-class PatchMerging(nn.Module):
-    r""" Patch Merging Layer.
-
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(4 * dim)
-
-    def forward(self, x):
-        """
-        x: B, H*W, C
-        """
-        H, W = self.input_resolution
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
-
-        x = x.view(B, H, W, C)
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.norm(x)
-        x = self.reduction(x)
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.dim
-        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        return flops
-
-
-class BasicLayer(nn.Module):
-    """ A basic Swin Transformer layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x, x_size):
-        for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x, x_size)
-            else:
-                x = blk(x, x_size)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-
-class RSTB(nn.Module):
-    """Residual Swin Transformer Block (RSTB).
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        img_size: Input image size.
-        patch_size: Patch size.
-        resi_connection: The convolutional block before residual connection.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 img_size=224, patch_size=4, resi_connection='1conv'):
-        super(RSTB, self).__init__()
-
-        self.dim = dim
-        self.input_resolution = input_resolution
-
-        self.residual_group = BasicLayer(dim=dim,
-                                         input_resolution=input_resolution,
-                                         depth=depth,
-                                         num_heads=num_heads,
-                                         window_size=window_size,
-                                         mlp_ratio=mlp_ratio,
-                                         qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                         drop=drop, attn_drop=attn_drop,
-                                         drop_path=drop_path,
-                                         norm_layer=norm_layer,
-                                         downsample=downsample,
-                                         use_checkpoint=use_checkpoint)
-
-        if resi_connection == '1conv':
-            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv = nn.Sequential(nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
-                                      nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim, 3, 1, 1))
-
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
-            norm_layer=None)
-
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
-            norm_layer=None)
-
-    def forward(self, x, x_size):
-        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
-
-    def flops(self):
-        flops = 0
-        flops += self.residual_group.flops()
-        H, W = self.input_resolution
-        flops += H * W * self.dim * self.dim * 9
-        flops += self.patch_embed.flops()
-        flops += self.patch_unembed.flops()
-
-        return flops
-
-
-class PatchEmbed(nn.Module):
-    r""" Image to Patch Embedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        x = x.flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-    def flops(self):
-        flops = 0
-        H, W = self.img_size
-        if self.norm is not None:
-            flops += H * W * self.embed_dim
-        return flops
-
-
-class PatchUnEmbed(nn.Module):
-    r""" Image to Patch Unembedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-    def forward(self, x, x_size):
-        B, HW, C = x.shape
-        x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])  # B Ph*Pw C
-        return x
-
-    def flops(self):
-        flops = 0
-        return flops
-
-
-class Upsample(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample, self).__init__(*m)
-
-
-class UpsampleOneStep(nn.Sequential):
-    """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
-       Used in lightweight SR to save parameters.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-
-    """
-
-    def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
-        self.num_feat = num_feat
-        self.input_resolution = input_resolution
-        m = []
-        m.append(nn.Conv2d(num_feat, (scale ** 2) * num_out_ch, 3, 1, 1))
-        m.append(nn.PixelShuffle(scale))
-        super(UpsampleOneStep, self).__init__(*m)
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.num_feat * 3 * 9
-        return flops
-
-
-class SwinIR(nn.Module):
-    r""" SwinIR
-        A PyTorch impl of : `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer.
-
-    Args:
-        img_size (int | tuple(int)): Input image size. Default 64
-        patch_size (int | tuple(int)): Patch size. Default: 1
-        in_chans (int): Number of input image channels. Default: 3
-        embed_dim (int): Patch embedding dimension. Default: 96
-        depths (tuple(int)): Depth of each Swin Transformer layer.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size. Default: 7
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
-        drop_rate (float): Dropout rate. Default: 0
-        attn_drop_rate (float): Attention dropout rate. Default: 0
-        drop_path_rate (float): Stochastic depth rate. Default: 0.1
-        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
-        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
-        patch_norm (bool): If True, add normalization after patch embedding. Default: True
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
-        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
-        img_range: Image range. 1. or 255.
-        upsampler: The reconstruction reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
-        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
-    """
-
-    def __init__(self, img_size=64, patch_size=1, in_chans=3,
-                 embed_dim=96, depths=[6, 6, 6, 6], num_heads=[6, 6, 6, 6],
-                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
-                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
-                 use_checkpoint=False, upscale=2, img_range=1., upsampler='', resi_connection='1conv',
-                 **kwargs):
-        super(SwinIR, self).__init__()
-        num_in_ch = in_chans
-        num_out_ch = in_chans
-        num_feat = 64
-        self.img_range = img_range
-        if in_chans == 3:
-            rgb_mean = (0.4488, 0.4371, 0.4040)
-            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
-        else:
-            self.mean = torch.zeros(1, 1, 1, 1)
-        self.upscale = upscale
-        self.upsampler = upsampler
-        self.window_size = window_size
-
-        #####################################################################################################
-        ################################### 1, shallow feature extraction ###################################
-        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
-
-        #####################################################################################################
-        ################################### 2, deep feature extraction ######################################
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = embed_dim
-        self.mlp_ratio = mlp_ratio
-
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        # merge non-overlapping patches into image
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-
-        # build Residual Swin Transformer blocks (RSTB)
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = RSTB(dim=embed_dim,
-                         input_resolution=(patches_resolution[0],
-                                           patches_resolution[1]),
-                         depth=depths[i_layer],
-                         num_heads=num_heads[i_layer],
-                         window_size=window_size,
-                         mlp_ratio=self.mlp_ratio,
-                         qkv_bias=qkv_bias, qk_scale=qk_scale,
-                         drop=drop_rate, attn_drop=attn_drop_rate,
-                         drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                         norm_layer=norm_layer,
-                         downsample=None,
-                         use_checkpoint=use_checkpoint,
-                         img_size=img_size,
-                         patch_size=patch_size,
-                         resi_connection=resi_connection
-
-                         )
-            self.layers.append(layer)
-        self.norm = norm_layer(self.num_features)
-
-        # build the last conv layer in deep feature extraction
-        if resi_connection == '1conv':
-            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv_after_body = nn.Sequential(nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
-
-        #####################################################################################################
-        ################################ 3, high quality image reconstruction ################################
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR (to save parameters)
-            self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
-                                            (patches_resolution[0], patches_resolution[1]))
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR (less artifacts)
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            if self.upscale == 4:
-                self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-
-    def check_image_size(self, x):
-        _, _, h, w = x.size()
-        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
-        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
-        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
-        return x
-
-    def forward_features(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward(self, x):
-        H, W = x.shape[2:]
-        x = self.check_image_size(x)
-        
-        self.mean = self.mean.type_as(x)
-        x = (x - self.mean) * self.img_range
-
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.conv_last(self.upsample(x))
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.upsample(x)
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            if self.upscale == 4:
-                x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.conv_last(self.lrelu(self.conv_hr(x)))
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            x_first = self.conv_first(x)
-            res = self.conv_after_body(self.forward_features(x_first)) + x_first
-            x = x + self.conv_last(res)
-
-        x = x / self.img_range + self.mean
-
-        return x[:, :, :H*self.upscale, :W*self.upscale]
-
-    def flops(self):
-        flops = 0
-        H, W = self.patches_resolution
-        flops += H * W * 3 * self.embed_dim * 9
-        flops += self.patch_embed.flops()
-        for i, layer in enumerate(self.layers):
-            flops += layer.flops()
-        flops += H * W * 3 * self.embed_dim * self.embed_dim
-        flops += self.upsample.flops()
-        return flops
-
-
-if __name__ == '__main__':
-    upscale = 4
-    window_size = 8
-    height = (1024 // upscale // window_size + 1) * window_size
-    width = (720 // upscale // window_size + 1) * window_size
-    model = SwinIR(upscale=2, img_size=(height, width),
-                   window_size=window_size, img_range=1., depths=[6, 6, 6, 6],
-                   embed_dim=60, num_heads=[6, 6, 6, 6], mlp_ratio=2, upsampler='pixelshuffledirect')
-    print(model)
-    print(height, width, model.flops() / 1e9)
-
-    x = torch.randn((1, 3, height, width))
-    x = model(x)
-    print(x.shape)
--- a/modules/swinir_model_arch_v2.py
+++ b/modules/swinir_model_arch_v2.py
--- a/modules/textual_inversion/autocrop.py
+++ b/modules/textual_inversion/autocrop.py
@ -276,8 +276,8 @@ def poi_average(pois, settings):
        weight += poi.weight
        x += poi.x * poi.weight
        y += poi.y * poi.weight
-    avg_x = round(x / weight)
-    avg_y = round(y / weight)
+    avg_x = round(weight and x / weight)
+    avg_y = round(weight and y / weight)

    return PointOfInterest(avg_x, avg_y)

@ -338,4 +338,4 @@ class Settings:
    self.face_points_weight = face_points_weight
    self.annotate_image = annotate_image
    self.destop_view_image = False
-    self.dnn_model_path = dnn_model_path
+    self.dnn_model_path = dnn_model_path
--- a/modules/textual_inversion/dataset.py
+++ b/modules/textual_inversion/dataset.py
@ -3,7 +3,7 @@ import numpy as np
 import PIL
 import torch
 from PIL import Image
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, DataLoader
 from torchvision import transforms

 import random
@ -11,25 +11,28 @@ import tqdm
 from modules import devices, shared
 import re

+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+
 re_numbers_at_start = re.compile(r"^[-\d]+\s*")


 class DatasetEntry:
-    def __init__(self, filename=None, latent=None, filename_text=None):
+    def __init__(self, filename=None, filename_text=None, latent_dist=None, latent_sample=None, cond=None, cond_text=None, pixel_values=None):
        self.filename = filename
-        self.latent = latent
        self.filename_text = filename_text
-        self.cond = None
-        self.cond_text = None
+        self.latent_dist = latent_dist
+        self.latent_sample = latent_sample
+        self.cond = cond
+        self.cond_text = cond_text
+        self.pixel_values = pixel_values


 class PersonalizedBase(Dataset):
-    def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_token="*", model=None, device=None, template_file=None, include_cond=False, batch_size=1):
+    def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_token="*", model=None, cond_model=None, device=None, template_file=None, include_cond=False, batch_size=1, gradient_step=1, shuffle_tags=False, tag_drop_out=0, latent_sampling_method='once'):
        re_word = re.compile(shared.opts.dataset_filename_word_regex) if len(shared.opts.dataset_filename_word_regex) > 0 else None

        self.placeholder_token = placeholder_token

-        self.batch_size = batch_size
        self.width = width
        self.height = height
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
@ -45,11 +48,16 @@ class PersonalizedBase(Dataset):
        assert os.path.isdir(data_root), "Dataset directory doesn't exist"
        assert os.listdir(data_root), "Dataset directory is empty"

-        cond_model = shared.sd_model.cond_stage_model
-
        self.image_paths = [os.path.join(data_root, file_path) for file_path in os.listdir(data_root)]
+
+
+        self.shuffle_tags = shuffle_tags
+        self.tag_drop_out = tag_drop_out
+
        print("Preparing dataset...")
        for path in tqdm.tqdm(self.image_paths):
+            if shared.state.interrupted:
+                raise Exception("interrupted")
            try:
                image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.BICUBIC)
            except Exception:
@ -71,53 +79,94 @@ class PersonalizedBase(Dataset):
            npimage = np.array(image).astype(np.uint8)
            npimage = (npimage / 127.5 - 1.0).astype(np.float32)

-            torchdata = torch.from_numpy(npimage).to(device=device, dtype=torch.float32)
-            torchdata = torch.moveaxis(torchdata, 2, 0)
+            torchdata = torch.from_numpy(npimage).permute(2, 0, 1).to(device=device, dtype=torch.float32)
+            latent_sample = None

-            init_latent = model.get_first_stage_encoding(model.encode_first_stage(torchdata.unsqueeze(dim=0))).squeeze()
-            init_latent = init_latent.to(devices.cpu)
+            with devices.autocast():
+                latent_dist = model.encode_first_stage(torchdata.unsqueeze(dim=0))

-            entry = DatasetEntry(filename=path, filename_text=filename_text, latent=init_latent)
+            if latent_sampling_method == "once" or (latent_sampling_method == "deterministic" and not isinstance(latent_dist, DiagonalGaussianDistribution)):
+                latent_sample = model.get_first_stage_encoding(latent_dist).squeeze().to(devices.cpu)
+                latent_sampling_method = "once"
+                entry = DatasetEntry(filename=path, filename_text=filename_text, latent_sample=latent_sample)
+            elif latent_sampling_method == "deterministic":
+                # Works only for DiagonalGaussianDistribution
+                latent_dist.std = 0
+                latent_sample = model.get_first_stage_encoding(latent_dist).squeeze().to(devices.cpu)
+                entry = DatasetEntry(filename=path, filename_text=filename_text, latent_sample=latent_sample)
+            elif latent_sampling_method == "random":
+                entry = DatasetEntry(filename=path, filename_text=filename_text, latent_dist=latent_dist)

-            if include_cond:
+            if not (self.tag_drop_out != 0 or self.shuffle_tags):
                entry.cond_text = self.create_text(filename_text)
-                entry.cond = cond_model([entry.cond_text]).to(devices.cpu).squeeze(0)
+
+            if include_cond and not (self.tag_drop_out != 0 or self.shuffle_tags):
+                with devices.autocast():
+                    entry.cond = cond_model([entry.cond_text]).to(devices.cpu).squeeze(0)

            self.dataset.append(entry)
+            del torchdata
+            del latent_dist
+            del latent_sample

-        assert len(self.dataset) > 0, "No images have been found in the dataset."
-        self.length = len(self.dataset) * repeats // batch_size
-
-        self.dataset_length = len(self.dataset)
-        self.indexes = None
-        self.shuffle()
-
-    def shuffle(self):
-        self.indexes = np.random.permutation(self.dataset_length)
+        self.length = len(self.dataset)
+        assert self.length > 0, "No images have been found in the dataset."
+        self.batch_size = min(batch_size, self.length)
+        self.gradient_step = min(gradient_step, self.length // self.batch_size)
+        self.latent_sampling_method = latent_sampling_method

    def create_text(self, filename_text):
        text = random.choice(self.lines)
+        tags = filename_text.split(',')
+        if self.tag_drop_out != 0:
+            tags = [t for t in tags if random.random() > self.tag_drop_out]
+        if self.shuffle_tags:
+            random.shuffle(tags)
+        text = text.replace("[filewords]", ','.join(tags))
        text = text.replace("[name]", self.placeholder_token)
-        text = text.replace("[filewords]", filename_text)
        return text

    def __len__(self):
        return self.length

    def __getitem__(self, i):
-        res = []
+        entry = self.dataset[i]
+        if self.tag_drop_out != 0 or self.shuffle_tags:
+            entry.cond_text = self.create_text(entry.filename_text)
+        if self.latent_sampling_method == "random":
+            entry.latent_sample = shared.sd_model.get_first_stage_encoding(entry.latent_dist).to(devices.cpu)
+        return entry

-        for j in range(self.batch_size):
-            position = i * self.batch_size + j
-            if position % len(self.indexes) == 0:
-                self.shuffle()
+class PersonalizedDataLoader(DataLoader):
+    def __init__(self, dataset, latent_sampling_method="once", batch_size=1, pin_memory=False):
+        super(PersonalizedDataLoader, self).__init__(dataset, shuffle=True, drop_last=True, batch_size=batch_size, pin_memory=pin_memory)
+        if latent_sampling_method == "random":
+            self.collate_fn = collate_wrapper_random
+        else:
+            self.collate_fn = collate_wrapper

-            index = self.indexes[position % len(self.indexes)]
-            entry = self.dataset[index]

-            if entry.cond is None:
-                entry.cond_text = self.create_text(entry.filename_text)
+class BatchLoader:
+    def __init__(self, data):
+        self.cond_text = [entry.cond_text for entry in data]
+        self.cond = [entry.cond for entry in data]
+        self.latent_sample = torch.stack([entry.latent_sample for entry in data]).squeeze(1)
+        #self.emb_index = [entry.emb_index for entry in data]
+        #print(self.latent_sample.device)

-            res.append(entry)
+    def pin_memory(self):
+        self.latent_sample = self.latent_sample.pin_memory()
+        return self

-        return res
+def collate_wrapper(batch):
+    return BatchLoader(batch)
+
+class BatchLoaderRandom(BatchLoader):
+    def __init__(self, data):
+        super().__init__(data)
+
+    def pin_memory(self):
+        return self
+
+def collate_wrapper_random(batch):
+    return BatchLoaderRandom(batch)
--- a/modules/textual_inversion/preprocess.py
+++ b/modules/textual_inversion/preprocess.py
@ -6,12 +6,10 @@ import sys
 import tqdm
 import time

-from modules import shared, images
+from modules import shared, images, deepbooru
 from modules.paths import models_path
 from modules.shared import opts, cmd_opts
 from modules.textual_inversion import autocrop
-if cmd_opts.deepdanbooru:
-    import modules.deepbooru as deepbooru


 def preprocess(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
@ -20,9 +18,7 @@ def preprocess(process_src, process_dst, process_width, process_height, preproce
            shared.interrogator.load()

        if process_caption_deepbooru:
-            db_opts = deepbooru.create_deepbooru_opts()
-            db_opts[deepbooru.OPT_INCLUDE_RANKS] = False
-            deepbooru.create_deepbooru_process(opts.interrogate_deepbooru_score_threshold, db_opts)
+            deepbooru.model.start()

        preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru, split_threshold, overlap_ratio, process_focal_crop, process_focal_crop_face_weight, process_focal_crop_entropy_weight, process_focal_crop_edges_weight, process_focal_crop_debug)

@ -32,7 +28,7 @@ def preprocess(process_src, process_dst, process_width, process_height, preproce
            shared.interrogator.send_blip_to_ram()

        if process_caption_deepbooru:
-            deepbooru.release_process()
+            deepbooru.model.stop()


 def listfiles(dirname):
@ -58,7 +54,7 @@ def save_pic_with_caption(image, index, params: PreprocessParams, existing_capti
    if params.process_caption_deepbooru:
        if len(caption) > 0:
            caption += ", "
-        caption += deepbooru.get_tags_from_process(image)
+        caption += deepbooru.model.tag_multi(image)

    filename_part = params.src
    filename_part = os.path.splitext(filename_part)[0]
@ -128,6 +124,7 @@ def preprocess_work(process_src, process_dst, process_width, process_height, pre

    files = listfiles(src)

+    shared.state.job = "preprocess"
    shared.state.textinfo = "Preprocessing..."
    shared.state.job_count = len(files)

--- a/modules/textual_inversion/textual_inversion.py
+++ b/modules/textual_inversion/textual_inversion.py
@ -10,7 +10,7 @@ import csv

 from PIL import Image, PngImagePlugin

-from modules import shared, devices, sd_hijack, processing, sd_models, images
+from modules import shared, devices, sd_hijack, processing, sd_models, images, sd_samplers
 import modules.textual_inversion.dataset
 from modules.textual_inversion.learn_schedule import LearnRateScheduler

@ -23,9 +23,12 @@ class Embedding:
        self.vec = vec
        self.name = name
        self.step = step
+        self.shape = None
+        self.vectors = 0
        self.cached_checksum = None
        self.sd_checkpoint = None
        self.sd_checkpoint_name = None
+        self.optimizer_state_dict = None

    def save(self, filename):
        embedding_data = {
@ -39,6 +42,13 @@ class Embedding:

        torch.save(embedding_data, filename)

+        if shared.opts.save_optimizer_state and self.optimizer_state_dict is not None:
+            optimizer_saved_dict = {
+                'hash': self.checksum(),
+                'optimizer_state_dict': self.optimizer_state_dict,
+            }
+            torch.save(optimizer_saved_dict, filename + '.optim')
+
    def checksum(self):
        if self.cached_checksum is not None:
            return self.cached_checksum
@ -57,14 +67,17 @@ class EmbeddingDatabase:
    def __init__(self, embeddings_dir):
        self.ids_lookup = {}
        self.word_embeddings = {}
+        self.skipped_embeddings = {}
        self.dir_mtime = None
        self.embeddings_dir = embeddings_dir
+        self.expected_shape = -1

    def register_embedding(self, embedding, model):

        self.word_embeddings[embedding.name] = embedding

-        ids = model.cond_stage_model.tokenizer([embedding.name], add_special_tokens=False)['input_ids'][0]
+        # TODO changing between clip and open clip changes tokenization, which will cause embeddings to stop working
+        ids = model.cond_stage_model.tokenize([embedding.name])[0]

        first_id = ids[0]
        if first_id not in self.ids_lookup:
@ -74,21 +87,26 @@ class EmbeddingDatabase:

        return embedding

-    def load_textual_inversion_embeddings(self):
+    def get_expected_shape(self):
+        vec = shared.sd_model.cond_stage_model.encode_embedding_init_text(",", 1)
+        return vec.shape[1]
+
+    def load_textual_inversion_embeddings(self, force_reload = False):
        mt = os.path.getmtime(self.embeddings_dir)
-        if self.dir_mtime is not None and mt <= self.dir_mtime:
+        if not force_reload and self.dir_mtime is not None and mt <= self.dir_mtime:
            return

        self.dir_mtime = mt
        self.ids_lookup.clear()
        self.word_embeddings.clear()
+        self.skipped_embeddings.clear()
+        self.expected_shape = self.get_expected_shape()

        def process_file(path, filename):
-            name = os.path.splitext(filename)[0]
+            name, ext = os.path.splitext(filename)
+            ext = ext.upper()

-            data = []
-
-            if os.path.splitext(filename.upper())[-1] in ['.PNG', '.WEBP', '.JXL', '.AVIF']:
+            if ext in ['.PNG', '.WEBP', '.JXL', '.AVIF']:
                embed_image = Image.open(path)
                if hasattr(embed_image, 'text') and 'sd-ti-embedding' in embed_image.text:
                    data = embedding_from_b64(embed_image.text['sd-ti-embedding'])
@ -96,8 +114,10 @@ class EmbeddingDatabase:
                else:
                    data = extract_image_data_embed(embed_image)
                    name = data.get('name', name)
-            else:
+            elif ext in ['.BIN', '.PT']:
                data = torch.load(path, map_location="cpu")
+            else:
+                return

            # textual inversion embeddings
            if 'string_to_param' in data:
@ -121,7 +141,13 @@ class EmbeddingDatabase:
            embedding.step = data.get('step', None)
            embedding.sd_checkpoint = data.get('sd_checkpoint', None)
            embedding.sd_checkpoint_name = data.get('sd_checkpoint_name', None)
-            self.register_embedding(embedding, shared.sd_model)
+            embedding.vectors = vec.shape[0]
+            embedding.shape = vec.shape[-1]
+
+            if self.expected_shape == -1 or self.expected_shape == embedding.shape:
+                self.register_embedding(embedding, shared.sd_model)
+            else:
+                self.skipped_embeddings[name] = embedding

        for fn in os.listdir(self.embeddings_dir):
            try:
@ -132,12 +158,13 @@ class EmbeddingDatabase:

                process_file(fullfn, fn)
            except Exception:
-                print(f"Error loading emedding {fn}:", file=sys.stderr)
+                print(f"Error loading embedding {fn}:", file=sys.stderr)
                print(traceback.format_exc(), file=sys.stderr)
                continue

-        print(f"Loaded a total of {len(self.word_embeddings)} textual inversion embeddings.")
-        print("Embeddings:", ', '.join(self.word_embeddings.keys()))
+        print(f"Textual inversion embeddings loaded({len(self.word_embeddings)}): {', '.join(self.word_embeddings.keys())}")
+        if len(self.skipped_embeddings) > 0:
+            print(f"Textual inversion embeddings skipped({len(self.skipped_embeddings)}): {', '.join(self.skipped_embeddings.keys())}")

    def find_embedding_at_position(self, tokens, offset):
        token = tokens[offset]
@ -155,13 +182,11 @@ class EmbeddingDatabase:

 def create_embedding(name, num_vectors_per_token, overwrite_old, init_text='*'):
    cond_model = shared.sd_model.cond_stage_model
-    embedding_layer = cond_model.wrapped.transformer.text_model.embeddings

    with devices.autocast():
        cond_model([""])  # will send cond model to GPU if lowvram/medvram is active

-    ids = cond_model.tokenizer(init_text, max_length=num_vectors_per_token, return_tensors="pt", add_special_tokens=False)["input_ids"]
-    embedded = embedding_layer.token_embedding.wrapped(ids.to(devices.device)).squeeze(0)
+    embedded = cond_model.encode_embedding_init_text(init_text, num_vectors_per_token)
    vec = torch.zeros((num_vectors_per_token, embedded.shape[1]), device=devices.device)

    for i in range(num_vectors_per_token):
@ -184,7 +209,7 @@ def write_loss(log_directory, filename, step, epoch_len, values):
    if shared.opts.training_write_csv_every == 0:
        return

-    if (step + 1) % shared.opts.training_write_csv_every != 0:
+    if step % shared.opts.training_write_csv_every != 0:
        return
    write_csv_header = False if os.path.exists(os.path.join(log_directory, filename)) else True

@ -194,21 +219,23 @@ def write_loss(log_directory, filename, step, epoch_len, values):
        if write_csv_header:
            csv_writer.writeheader()

-        epoch = step // epoch_len
-        epoch_step = step % epoch_len 
+        epoch = (step - 1) // epoch_len
+        epoch_step = (step - 1) % epoch_len

        csv_writer.writerow({
-            "step": step + 1,
+            "step": step,
            "epoch": epoch,
-            "epoch_step": epoch_step + 1,
+            "epoch_step": epoch_step,
            **values,
        })

-def validate_train_inputs(model_name, learn_rate, batch_size, data_root, template_file, steps, save_model_every, create_image_every, log_directory, name="embedding"):
+def validate_train_inputs(model_name, learn_rate, batch_size, gradient_step, data_root, template_file, steps, save_model_every, create_image_every, log_directory, name="embedding"):
    assert model_name, f"{name} not selected"
    assert learn_rate, "Learning rate is empty or 0"
    assert isinstance(batch_size, int), "Batch size must be integer"
    assert batch_size > 0, "Batch size must be positive"
+    assert isinstance(gradient_step, int), "Gradient accumulation step must be integer"
+    assert gradient_step > 0, "Gradient accumulation step must be positive"
    assert data_root, "Dataset directory is empty"
    assert os.path.isdir(data_root), "Dataset directory doesn't exist"
    assert os.listdir(data_root), "Dataset directory is empty"
@ -224,11 +251,12 @@ def validate_train_inputs(model_name, learn_rate, batch_size, data_root, templat
    if save_model_every or create_image_every:
        assert log_directory, "Log directory is empty"

-def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_directory, training_width, training_height, steps, clip_grad_mode, clip_grad_value, create_image_every, save_embedding_every, template_file, save_image_with_stored_embedding, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height):
+def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width, training_height, steps, clip_grad_mode, clip_grad_value, shuffle_tags, tag_drop_out, latent_sampling_method, create_image_every, save_embedding_every, template_file, save_image_with_stored_embedding, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height):
    save_embedding_every = save_embedding_every or 0
    create_image_every = create_image_every or 0
-    validate_train_inputs(embedding_name, learn_rate, batch_size, data_root, template_file, steps, save_embedding_every, create_image_every, log_directory, name="embedding")
+    validate_train_inputs(embedding_name, learn_rate, batch_size, gradient_step, data_root, template_file, steps, save_embedding_every, create_image_every, log_directory, name="embedding")

+    shared.state.job = "train-embedding"
    shared.state.textinfo = "Initializing textual inversion training..."
    shared.state.job_count = steps

@ -255,19 +283,16 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
    else:
        images_embeds_dir = None

-    cond_model = shared.sd_model.cond_stage_model
-
    hijack = sd_hijack.model_hijack

    embedding = hijack.embedding_db.word_embeddings[embedding_name]
    checkpoint = sd_models.select_checkpoint()

-    ititial_step = embedding.step or 0
-    if ititial_step >= steps:
-        shared.state.textinfo = f"Model has already been trained beyond specified max steps"
+    initial_step = embedding.step or 0
+    if initial_step >= steps:
+        shared.state.textinfo = "Model has already been trained beyond specified max steps"
        return embedding, filename
-
-    scheduler = LearnRateScheduler(learn_rate, steps, ititial_step)
+    scheduler = LearnRateScheduler(learn_rate, steps, initial_step)

    clip_grad = torch.nn.utils.clip_grad_value_ if clip_grad_mode == "value" else \
        torch.nn.utils.clip_grad_norm_ if clip_grad_mode == "norm" else \
@ -276,156 +301,219 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
        clip_grad_sched = LearnRateScheduler(clip_grad_value, steps, ititial_step, verbose=False)
    # dataset loading may take a while, so input validations and early returns should be done before this
    shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
-    with torch.autocast("cuda"):
-        ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=embedding_name, model=shared.sd_model, device=devices.device, template_file=template_file, batch_size=batch_size)
+    old_parallel_processing_allowed = shared.parallel_processing_allowed
+
+    pin_memory = shared.opts.pin_memory
+
+    ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=embedding_name, model=shared.sd_model, cond_model=shared.sd_model.cond_stage_model, device=devices.device, template_file=template_file, batch_size=batch_size, gradient_step=gradient_step, shuffle_tags=shuffle_tags, tag_drop_out=tag_drop_out, latent_sampling_method=latent_sampling_method)
+
+    latent_sampling_method = ds.latent_sampling_method
+
+    dl = modules.textual_inversion.dataset.PersonalizedDataLoader(ds, latent_sampling_method=latent_sampling_method, batch_size=ds.batch_size, pin_memory=pin_memory)
+
    if unload:
+        shared.parallel_processing_allowed = False
        shared.sd_model.first_stage_model.to(devices.cpu)

    embedding.vec.requires_grad = True
-    optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate)
+    optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate, weight_decay=0.0)
+    if shared.opts.save_optimizer_state:
+        optimizer_state_dict = None
+        if os.path.exists(filename + '.optim'):
+            optimizer_saved_dict = torch.load(filename + '.optim', map_location='cpu')
+            if embedding.checksum() == optimizer_saved_dict.get('hash', None):
+                optimizer_state_dict = optimizer_saved_dict.get('optimizer_state_dict', None)
+    
+        if optimizer_state_dict is not None:
+            optimizer.load_state_dict(optimizer_state_dict)
+            print("Loaded existing optimizer from checkpoint")
+        else:
+            print("No saved optimizer exists in checkpoint")

-    losses = torch.zeros((32,))
+    scaler = torch.cuda.amp.GradScaler()
+
+    batch_size = ds.batch_size
+    gradient_step = ds.gradient_step
+    # n images processed = batch_size * gradient_step * n steps
+    steps_per_epoch = len(ds) // batch_size // gradient_step
+    max_steps_per_epoch = len(ds) // batch_size - (len(ds) // batch_size) % gradient_step
+    loss_step = 0
+    _loss_step = 0 #internal

    last_saved_file = "<none>"
    last_saved_image = "<none>"
    forced_filename = "<none>"
    embedding_yet_to_be_embedded = False

-    pbar = tqdm.tqdm(enumerate(ds), total=steps-ititial_step)
-    for i, entries in pbar:
-        embedding.step = i + ititial_step
+    is_training_inpainting_model = shared.sd_model.model.conditioning_key in {'hybrid', 'concat'}
+    img_c = None

-        scheduler.apply(optimizer, embedding.step)
-        if scheduler.finished:
-            break
+    pbar = tqdm.tqdm(total=steps - initial_step)
+    try:
+        for i in range((steps-initial_step) * gradient_step):
+            if scheduler.finished:
+                break
+            if shared.state.interrupted:
+                break
+            for j, batch in enumerate(dl):
+                # acts like drop_last=True for gradient accumulation: stop before trailing batches that cannot fill a whole accumulation group
+                if j == max_steps_per_epoch:
+                    break
+                scheduler.apply(optimizer, embedding.step)
+                if scheduler.finished:
+                    break
+                if shared.state.interrupted:
+                    break

-        if shared.state.interrupted:
-            break
+                if clip_grad:
+                    clip_grad_sched.step(embedding.step)
+            
+                with devices.autocast():
+                    x = batch.latent_sample.to(devices.device, non_blocking=pin_memory)
+                    c = shared.sd_model.cond_stage_model(batch.cond_text)

-        if clip_grad:
-            clip_grad_sched.step(embedding.step)
+                    if is_training_inpainting_model:
+                        if img_c is None:
+                            img_c = processing.txt2img_image_conditioning(shared.sd_model, c, training_width, training_height)

-        with torch.autocast("cuda"):
-            c = cond_model([entry.cond_text for entry in entries])
-            x = torch.stack([entry.latent for entry in entries]).to(devices.device)
-            loss = shared.sd_model(x, c)[0]
-            del x
+                        cond = {"c_concat": [img_c], "c_crossattn": [c]}
+                    else:
+                        cond = c

-            losses[embedding.step % losses.shape[0]] = loss.item()
+                    loss = shared.sd_model(x, cond)[0] / gradient_step
+                    del x

-            optimizer.zero_grad()
-            loss.backward()
+                    _loss_step += loss.item()
+                scaler.scale(loss).backward()

-            if clip_grad:
-                clip_grad(embedding.vec, clip_grad_sched.learn_rate)
+                # keep accumulating gradients until gradient_step batches have been processed
+                if (j + 1) % gradient_step != 0:
+                    continue
+                
+                if clip_grad:
+                    clip_grad(embedding.vec, clip_grad_sched.learn_rate)

-            optimizer.step()
+                scaler.step(optimizer)
+                scaler.update()
+                embedding.step += 1
+                pbar.update()
+                optimizer.zero_grad(set_to_none=True)
+                loss_step = _loss_step
+                _loss_step = 0

-        steps_done = embedding.step + 1
+                steps_done = embedding.step + 1

-        epoch_num = embedding.step // len(ds)
-        epoch_step = embedding.step % len(ds)
+                epoch_num = embedding.step // steps_per_epoch
+                epoch_step = embedding.step % steps_per_epoch

-        pbar.set_description(f"[Epoch {epoch_num}: {epoch_step+1}/{len(ds)}]loss: {losses.mean():.7f}")
+                pbar.set_description(f"[Epoch {epoch_num}: {epoch_step+1}/{steps_per_epoch}]loss: {loss_step:.7f}")
+                if embedding_dir is not None and steps_done % save_embedding_every == 0:
+                    # Before saving, change name to match current checkpoint.
+                    embedding_name_every = f'{embedding_name}-{steps_done}'
+                    last_saved_file = os.path.join(embedding_dir, f'{embedding_name_every}.pt')
+                    save_embedding(embedding, optimizer, checkpoint, embedding_name_every, last_saved_file, remove_cached_checksum=True)
+                    embedding_yet_to_be_embedded = True

-        if embedding_dir is not None and steps_done % save_embedding_every == 0:
-            # Before saving, change name to match current checkpoint.
-            embedding_name_every = f'{embedding_name}-{steps_done}'
-            last_saved_file = os.path.join(embedding_dir, f'{embedding_name_every}.pt')
-            save_embedding(embedding, checkpoint, embedding_name_every, last_saved_file, remove_cached_checksum=True)
-            embedding_yet_to_be_embedded = True
+                write_loss(log_directory, "textual_inversion_loss.csv", embedding.step, steps_per_epoch, {
+                    "loss": f"{loss_step:.7f}",
+                    "learn_rate": scheduler.learn_rate
+                })

-        write_loss(log_directory, "textual_inversion_loss.csv", embedding.step, len(ds), {
-            "loss": f"{losses.mean():.7f}",
-            "learn_rate": scheduler.learn_rate
-        })
+                if images_dir is not None and steps_done % create_image_every == 0:
+                    forced_filename = f'{embedding_name}-{steps_done}'
+                    last_saved_image = os.path.join(images_dir, forced_filename)

-        if images_dir is not None and steps_done % create_image_every == 0:
-            forced_filename = f'{embedding_name}-{steps_done}'
-            last_saved_image = os.path.join(images_dir, forced_filename)
+                    shared.sd_model.first_stage_model.to(devices.device)

-            shared.sd_model.first_stage_model.to(devices.device)
+                    p = processing.StableDiffusionProcessingTxt2Img(
+                        sd_model=shared.sd_model,
+                        do_not_save_grid=True,
+                        do_not_save_samples=True,
+                        do_not_reload_embeddings=True,
+                    )

-            p = processing.StableDiffusionProcessingTxt2Img(
-                sd_model=shared.sd_model,
-                do_not_save_grid=True,
-                do_not_save_samples=True,
-                do_not_reload_embeddings=True,
-            )
+                    if preview_from_txt2img:
+                        p.prompt = preview_prompt
+                        p.negative_prompt = preview_negative_prompt
+                        p.steps = preview_steps
+                        p.sampler_name = sd_samplers.samplers[preview_sampler_index].name
+                        p.cfg_scale = preview_cfg_scale
+                        p.seed = preview_seed
+                        p.width = preview_width
+                        p.height = preview_height
+                    else:
+                        p.prompt = batch.cond_text[0]
+                        p.steps = 20
+                        p.width = training_width
+                        p.height = training_height

-            if preview_from_txt2img:
-                p.prompt = preview_prompt
-                p.negative_prompt = preview_negative_prompt
-                p.steps = preview_steps
-                p.sampler_index = preview_sampler_index
-                p.cfg_scale = preview_cfg_scale
-                p.seed = preview_seed
-                p.width = preview_width
-                p.height = preview_height
-            else:
-                p.prompt = entries[0].cond_text
-                p.steps = 20
-                p.width = training_width
-                p.height = training_height
+                    preview_text = p.prompt

-            preview_text = p.prompt
+                    processed = processing.process_images(p)
+                    image = processed.images[0] if len(processed.images) > 0 else None

-            processed = processing.process_images(p)
-            image = processed.images[0]
+                    if unload:
+                        shared.sd_model.first_stage_model.to(devices.cpu)

-            if unload:
-                shared.sd_model.first_stage_model.to(devices.cpu)
+                    if image is not None:
+                        shared.state.current_image = image
+                        last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
+                        last_saved_image += f", prompt: {preview_text}"

-            shared.state.current_image = image
+                    if save_image_with_stored_embedding and os.path.exists(last_saved_file) and embedding_yet_to_be_embedded:

-            if save_image_with_stored_embedding and os.path.exists(last_saved_file) and embedding_yet_to_be_embedded:
+                        last_saved_image_chunks = os.path.join(images_embeds_dir, f'{embedding_name}-{steps_done}.png')

-                last_saved_image_chunks = os.path.join(images_embeds_dir, f'{embedding_name}-{steps_done}.png')
+                        info = PngImagePlugin.PngInfo()
+                        data = torch.load(last_saved_file)
+                        info.add_text("sd-ti-embedding", embedding_to_b64(data))

-                info = PngImagePlugin.PngInfo()
-                data = torch.load(last_saved_file)
-                info.add_text("sd-ti-embedding", embedding_to_b64(data))
+                        title = "<{}>".format(data.get('name', '???'))

-                title = "<{}>".format(data.get('name', '???'))
+                        try:
+                            vectorSize = list(data['string_to_param'].values())[0].shape[0]
+                        except Exception as e:
+                            vectorSize = '?'

-                try:
-                    vectorSize = list(data['string_to_param'].values())[0].shape[0]
-                except Exception as e:
-                    vectorSize = '?'
+                        checkpoint = sd_models.select_checkpoint()
+                        footer_left = checkpoint.model_name
+                        footer_mid = '[{}]'.format(checkpoint.hash)
+                        footer_right = '{}v {}s'.format(vectorSize, steps_done)

-                checkpoint = sd_models.select_checkpoint()
-                footer_left = checkpoint.model_name
-                footer_mid = '[{}]'.format(checkpoint.hash)
-                footer_right = '{}v {}s'.format(vectorSize, steps_done)
+                        captioned_image = caption_image_overlay(image, title, footer_left, footer_mid, footer_right)
+                        captioned_image = insert_image_data_embed(captioned_image, data)

-                captioned_image = caption_image_overlay(image, title, footer_left, footer_mid, footer_right)
-                captioned_image = insert_image_data_embed(captioned_image, data)
+                        captioned_image.save(last_saved_image_chunks, "PNG", pnginfo=info)
+                        embedding_yet_to_be_embedded = False

-                captioned_image.save(last_saved_image_chunks, "PNG", pnginfo=info)
-                embedding_yet_to_be_embedded = False
+                    last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
+                    last_saved_image += f", prompt: {preview_text}"

-            last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
-            last_saved_image += f", prompt: {preview_text}"
+                shared.state.job_no = embedding.step

-        shared.state.job_no = embedding.step
-
-        shared.state.textinfo = f"""
+                shared.state.textinfo = f"""
 <p>
-Loss: {losses.mean():.7f}<br/>
-Step: {embedding.step}<br/>
-Last prompt: {html.escape(entries[0].cond_text)}<br/>
+Loss: {loss_step:.7f}<br/>
+Step: {steps_done}<br/>
+Last prompt: {html.escape(batch.cond_text[0])}<br/>
 Last saved embedding: {html.escape(last_saved_file)}<br/>
 Last saved image: {html.escape(last_saved_image)}<br/>
 </p>
 """
-
-    filename = os.path.join(shared.cmd_opts.embeddings_dir, f'{embedding_name}.pt')
-    save_embedding(embedding, checkpoint, embedding_name, filename, remove_cached_checksum=True)
-    shared.sd_model.first_stage_model.to(devices.device)
+        filename = os.path.join(shared.cmd_opts.embeddings_dir, f'{embedding_name}.pt')
+        save_embedding(embedding, optimizer, checkpoint, embedding_name, filename, remove_cached_checksum=True)
+    except Exception:
+        print(traceback.format_exc(), file=sys.stderr)
+        pass
+    finally:
+        pbar.leave = False
+        pbar.close()
+        shared.sd_model.first_stage_model.to(devices.device)
+        shared.parallel_processing_allowed = old_parallel_processing_allowed

    return embedding, filename

-def save_embedding(embedding, checkpoint, embedding_name, filename, remove_cached_checksum=True):
+def save_embedding(embedding, optimizer, checkpoint, embedding_name, filename, remove_cached_checksum=True):
    old_embedding_name = embedding.name
    old_sd_checkpoint = embedding.sd_checkpoint if hasattr(embedding, "sd_checkpoint") else None
    old_sd_checkpoint_name = embedding.sd_checkpoint_name if hasattr(embedding, "sd_checkpoint_name") else None
@ -436,6 +524,7 @@ def save_embedding(embedding, checkpoint, embedding_name, filename, remove_cache
        if remove_cached_checksum:
            embedding.cached_checksum = None
        embedding.name = embedding_name
+        embedding.optimizer_state_dict = optimizer.state_dict()
        embedding.save(filename)
    except:
        embedding.sd_checkpoint = old_sd_checkpoint
--- a/modules/textual_inversion/ui.py
+++ b/modules/textual_inversion/ui.py
@ -18,7 +18,7 @@ def create_embedding(name, initialization_text, nvpt, overwrite_old):
 def preprocess(*args):
    modules.textual_inversion.preprocess.preprocess(*args)

-    return "Preprocessing finished.", ""
+    return f"Preprocessing {'interrupted' if shared.state.interrupted else 'finished'}.", ""


 def train_embedding(*args):
--- a/modules/txt2img.py
+++ b/modules/txt2img.py
@ -1,4 +1,5 @@
 import modules.scripts
+from modules import sd_samplers
 from modules.processing import StableDiffusionProcessing, Processed, StableDiffusionProcessingTxt2Img, \
    StableDiffusionProcessingImg2Img, process_images
 from modules.shared import opts, cmd_opts
@ -7,7 +8,7 @@ import modules.processing as processing
 from modules.ui import plaintext_to_html


-def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, enable_hr: bool, denoising_strength: float, firstphase_width: int, firstphase_height: int, *args):
+def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, enable_hr: bool, denoising_strength: float, hr_scale: float, hr_upscaler: str, *args):
    p = StableDiffusionProcessingTxt2Img(
        sd_model=shared.sd_model,
        outpath_samples=opts.outdir_samples or opts.outdir_txt2img_samples,
@ -21,7 +22,7 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
        seed_resize_from_h=seed_resize_from_h,
        seed_resize_from_w=seed_resize_from_w,
        seed_enable_extras=seed_enable_extras,
-        sampler_index=sampler_index,
+        sampler_name=sd_samplers.samplers[sampler_index].name,
        batch_size=batch_size,
        n_iter=n_iter,
        steps=steps,
@ -32,8 +33,8 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
        tiling=tiling,
        enable_hr=enable_hr,
        denoising_strength=denoising_strength if enable_hr else None,
-        firstphase_width=firstphase_width if enable_hr else None,
-        firstphase_height=firstphase_height if enable_hr else None,
+        hr_scale=hr_scale,
+        hr_upscaler=hr_upscaler,
    )

    p.scripts = modules.scripts.scripts_txt2img
@ -58,4 +59,4 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
    if opts.do_not_show_images:
        processed.images = []

-    return processed.images, generation_info_js, plaintext_to_html(processed.info)
+    return processed.images, generation_info_js, plaintext_to_html(processed.info), plaintext_to_html(processed.comments)
--- a/modules/ui.py
+++ b/modules/ui.py
--- a/modules/ui_components.py
+++ b/modules/ui_components.py
@ -0,0 +1,25 @@
+import gradio as gr
+
+
+class ToolButton(gr.Button, gr.components.FormComponent):
+    """Small button with single emoji as text, fits inside gradio forms"""
+
+    def __init__(self, **kwargs):
+        super().__init__(variant="tool", **kwargs)
+
+    def get_block_name(self):
+        return "button"
+
+
+class FormRow(gr.Row, gr.components.FormComponent):
+    """Same as gr.Row but fits inside gradio forms"""
+
+    def get_block_name(self):
+        return "row"
+
+
+class FormGroup(gr.Group, gr.components.FormComponent):
+    """Same as gr.Group but fits inside gradio forms"""
+
+    def get_block_name(self):
+        return "group"
--- a/modules/ui_extensions.py
+++ b/modules/ui_extensions.py
@ -9,6 +9,8 @@ import git

 import gradio as gr
 import html
+import shutil
+import errno

 from modules import extensions, shared, paths

@ -17,7 +19,7 @@ available_extensions = {"extensions": []}


 def check_access():
-    assert not shared.cmd_opts.disable_extension_access, "extension access disabed because of commandline flags"
+    assert not shared.cmd_opts.disable_extension_access, "extension access disabled because of command line flags"


 def apply_and_restart(disable_list, update_list):
@ -36,9 +38,9 @@ def apply_and_restart(disable_list, update_list):
            continue

        try:
-            ext.pull()
+            ext.fetch_and_reset_hard()
        except Exception:
-            print(f"Error pulling updates for {ext.name}:", file=sys.stderr)
+            print(f"Error getting updates for {ext.name}:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)

    shared.opts.disabled_extensions = disabled
@ -78,6 +80,12 @@ def extension_table():
    """

    for ext in extensions.extensions:
+        remote = ""
+        if ext.is_builtin:
+            remote = "built-in"
+        elif ext.remote:
+            remote = f"""<a href="{html.escape(ext.remote or '')}" target="_blank">{html.escape("built-in" if ext.is_builtin else ext.remote or '')}</a>"""
+
        if ext.can_update:
            ext_status = f"""<label><input class="gr-check-radio gr-checkbox" name="update_{html.escape(ext.name)}" checked="checked" type="checkbox">{html.escape(ext.status)}</label>"""
        else:
@ -86,7 +94,7 @@ def extension_table():
        code += f"""
            <tr>
                <td><label><input class="gr-check-radio gr-checkbox" name="enable_{html.escape(ext.name)}" type="checkbox" {'checked="checked"' if ext.enabled else ''}>{html.escape(ext.name)}</label></td>
-                <td><a href="{html.escape(ext.remote or '')}" target="_blank">{html.escape(ext.remote or '')}</a></td>
+                <td>{remote}</td>
                <td{' class="extension_status"' if ext.remote is not None else ''}>{ext_status}</td>
            </tr>
    """
@ -132,7 +140,21 @@ def install_extension_from_url(dirname, url):
        repo = git.Repo.clone_from(url, tmpdir)
        repo.remote().fetch()

-        os.rename(tmpdir, target_dir)
+        try:
+            os.rename(tmpdir, target_dir)
+        except OSError as err:
+            # TODO what does this do on windows? I think it'll be a different error code but I don't have a system to check it
+            # Shouldn't cause any new issues at least but we probably want to handle it there too.
+            if err.errno == errno.EXDEV:
+                # Cross device link, typical in docker or when tmp/ and extensions/ are on different file systems
+                # Since we can't use a rename, do the slower but more versatile shutil.move()
+                shutil.move(tmpdir, target_dir)
+            else:
+                # Something else (not enough free space, permissions, etc.); re-raise it so that it gets handled.
+                raise(err)
+
+        import launch
+        launch.run_extension_installer(target_dir)

        extensions.list_extensions()
        return [extension_table(), html.escape(f"Installed into {target_dir}. Use Installed tab to restart.")]
@ -197,12 +219,13 @@ def refresh_available_extensions_from_data(hide_tags):
        if url is None:
            continue

+        existing = installed_extension_urls.get(normalize_git_url(url), None)
+        extension_tags = extension_tags + ["installed"] if existing else extension_tags
+
        if len([x for x in extension_tags if x in tags_to_hide]) > 0:
            hidden += 1
            continue

-        existing = installed_extension_urls.get(normalize_git_url(url), None)
-
        install_code = f"""<input onclick="install_extension_from_index(this, '{html.escape(url)}')" type="button" value="{"Install" if not existing else "Installed"}" {"disabled=disabled" if existing else ""} class="gr-button gr-button-lg gr-button-secondary">"""

        tags_text = ", ".join([f"<span class='extension-tag' title='{tags.get(x, '')}'>{x}</span>" for x in extension_tags])
@ -213,7 +236,11 @@ def refresh_available_extensions_from_data(hide_tags):
                <td>{html.escape(description)}</td>
                <td>{install_code}</td>
            </tr>
-    """
+        
+        """
+
+        for tag in [x for x in extension_tags if x not in tags]:
+            tags[tag] = tag

    code += """
        </tbody>
@ -263,7 +290,7 @@ def create_ui():
                    install_extension_button = gr.Button(elem_id="install_extension_button", visible=False)

                with gr.Row():
-                    hide_tags = gr.CheckboxGroup(value=["ads", "localization"], label="Hide extensions with tags", choices=["script", "ads", "localization"])
+                    hide_tags = gr.CheckboxGroup(value=["ads", "localization", "installed"], label="Hide extensions with tags", choices=["script", "ads", "localization", "installed"])

                install_result = gr.HTML()
                available_extensions_table = gr.HTML()
--- a/modules/ui_tempdir.py
+++ b/modules/ui_tempdir.py
@ -0,0 +1,82 @@
+import os
+import tempfile
+from collections import namedtuple
+from pathlib import Path
+
+import gradio as gr
+
+from PIL import PngImagePlugin
+
+from modules import shared
+
+
+Savedfile = namedtuple("Savedfile", ["name"])
+
+
+def register_tmp_file(gradio, filename):
+    if hasattr(gradio, 'temp_file_sets'):  # gradio 3.15
+        gradio.temp_file_sets[0] = gradio.temp_file_sets[0] | {os.path.abspath(filename)}
+
+    if hasattr(gradio, 'temp_dirs'):  # gradio 3.9
+        gradio.temp_dirs = gradio.temp_dirs | {os.path.abspath(os.path.dirname(filename))}
+
+
+def check_tmp_file(gradio, filename):
+    if hasattr(gradio, 'temp_file_sets'):
+        return any([filename in fileset for fileset in gradio.temp_file_sets])
+
+    if hasattr(gradio, 'temp_dirs'):
+        return any(Path(temp_dir).resolve() in Path(filename).resolve().parents for temp_dir in gradio.temp_dirs)
+
+    return False
+
+
+def save_pil_to_file(pil_image, dir=None):
+    already_saved_as = getattr(pil_image, 'already_saved_as', None)
+    if already_saved_as and os.path.isfile(already_saved_as):
+        register_tmp_file(shared.demo, already_saved_as)
+
+        file_obj = Savedfile(already_saved_as)
+        return file_obj
+
+    if shared.opts.temp_dir != "":
+        dir = shared.opts.temp_dir
+
+    use_metadata = False
+    metadata = PngImagePlugin.PngInfo()
+    for key, value in pil_image.info.items():
+        if isinstance(key, str) and isinstance(value, str):
+            metadata.add_text(key, value)
+            use_metadata = True
+
+    file_obj = tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=dir)
+    pil_image.save(file_obj, pnginfo=(metadata if use_metadata else None))
+    return file_obj
+
+
+# override save to file function so that it also writes PNG info
+gr.processing_utils.save_pil_to_file = save_pil_to_file
+
+
+def on_tmpdir_changed():
+    if shared.opts.temp_dir == "" or shared.demo is None:
+        return
+
+    os.makedirs(shared.opts.temp_dir, exist_ok=True)
+
+    register_tmp_file(shared.demo, os.path.join(shared.opts.temp_dir, "x"))
+
+
+def cleanup_tmpdr():
+    temp_dir = shared.opts.temp_dir
+    if temp_dir == "" or not os.path.isdir(temp_dir):
+        return
+
+    for root, dirs, files in os.walk(temp_dir, topdown=False):
+        for name in files:
+            _, extension = os.path.splitext(name)
+            if extension != ".png":
+                continue
+
+            filename = os.path.join(root, name)
+            os.remove(filename)
--- a/modules/upscaler.py
+++ b/modules/upscaler.py
@ -53,10 +53,10 @@ class Upscaler:
    def do_upscale(self, img: PIL.Image, selected_model: str):
        return img

-    def upscale(self, img: PIL.Image, scale: int, selected_model: str = None):
+    def upscale(self, img: PIL.Image, scale, selected_model: str = None):
        self.scale = scale
-        dest_w = img.width * scale
-        dest_h = img.height * scale
+        dest_w = int(img.width * scale)
+        dest_h = int(img.height * scale)

        for i in range(3):
            shape = (img.width, img.height)
--- a/modules/xlmr.py
+++ b/modules/xlmr.py
@ -0,0 +1,137 @@
+from transformers import BertPreTrainedModel,BertModel,BertConfig
+import torch.nn as nn
+import torch
+from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
+from transformers import XLMRobertaModel,XLMRobertaTokenizer
+from typing import Optional
+
+class BertSeriesConfig(BertConfig):
+    """BertConfig extended with three extra attributes.
+
+    ``project_dim``, ``pooler_fn`` and ``learn_encoder`` are stored on the
+    config as given. ``model_type`` is accepted by the constructor but is not
+    used by it. All remaining arguments are forwarded to ``BertConfig``.
+    """
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        project_dim=512,
+        pooler_fn="average",
+        learn_encoder=False,
+        model_type='bert',
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            intermediate_size=intermediate_size,
+            hidden_act=hidden_act,
+            hidden_dropout_prob=hidden_dropout_prob,
+            attention_probs_dropout_prob=attention_probs_dropout_prob,
+            max_position_embeddings=max_position_embeddings,
+            type_vocab_size=type_vocab_size,
+            initializer_range=initializer_range,
+            layer_norm_eps=layer_norm_eps,
+            pad_token_id=pad_token_id,
+            position_embedding_type=position_embedding_type,
+            use_cache=use_cache,
+            classifier_dropout=classifier_dropout,
+            **kwargs,
+        )
+        # Extra settings stored on the config in addition to the BertConfig ones.
+        self.project_dim = project_dim
+        self.pooler_fn = pooler_fn
+        self.learn_encoder = learn_encoder
+
+class RobertaSeriesConfig(XLMRobertaConfig):
+    """XLMRobertaConfig extended with three extra attributes.
+
+    ``project_dim``, ``pooler_fn`` and ``learn_encoder`` are stored on the
+    config as given. The special-token ids and any other keyword arguments are
+    forwarded to ``XLMRobertaConfig``.
+    """
+
+    def __init__(
+        self,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        project_dim=512,
+        pooler_fn='cls',
+        learn_encoder=False,
+        **kwargs,
+    ):
+        special_token_ids = {
+            "pad_token_id": pad_token_id,
+            "bos_token_id": bos_token_id,
+            "eos_token_id": eos_token_id,
+        }
+        super().__init__(**special_token_ids, **kwargs)
+
+        # Extra settings stored on the config in addition to the parent's ones.
+        self.project_dim = project_dim
+        self.pooler_fn = pooler_fn
+        self.learn_encoder = learn_encoder
+
+
+class BertSeriesModelWithTransformation(BertPreTrainedModel):
+    """XLM-RoBERTa text encoder with a linear projection head.
+
+    ``self.transformation`` maps encoder states from ``config.hidden_size`` to
+    ``config.project_dim``. ``encode`` is the text-in / tensor-out entry point;
+    ``forward`` returns the raw encoder outputs and the projections in one dict.
+    """
+
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+    config_class = BertSeriesConfig
+
+    def __init__(self, config=None, **kargs):
+        """Build the encoder, layer norm, projection layer and tokenizer.
+
+        Args:
+            config: Model configuration. It must provide ``hidden_size``,
+                ``layer_norm_eps`` and ``project_dim``. When None, a built-in
+                XLMRobertaConfig (24 layers, hidden size 1024, project_dim 768)
+                is used so the model can be created without arguments.
+            **kargs: Accepted for call compatibility; not used.
+        """
+        # modify initialization for autoloading
+        if config is None:
+            config = XLMRobertaConfig()
+            config.attention_probs_dropout_prob = 0.1
+            config.bos_token_id = 0
+            config.eos_token_id = 2
+            config.hidden_act = 'gelu'
+            config.hidden_dropout_prob = 0.1
+            config.hidden_size = 1024
+            config.initializer_range = 0.02
+            config.intermediate_size = 4096
+            config.layer_norm_eps = 1e-05
+            config.max_position_embeddings = 514
+
+            config.num_attention_heads = 16
+            config.num_hidden_layers = 24
+            config.output_past = True
+            config.pad_token_id = 1
+            config.position_embedding_type = "absolute"
+
+            config.type_vocab_size = 1
+            config.use_cache = True
+            config.vocab_size = 250002
+            config.project_dim = 768
+            config.learn_encoder = False
+        super().__init__(config)
+        self.roberta = XLMRobertaModel(config)
+        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
+        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # The tokenizer is always loaded by name, independent of `config`.
+        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        # Pooling takes index 0 along the second axis of its input.
+        self.pooler = lambda x: x[:, 0]
+        self.post_init()
+
+    def encode(self, c):
+        """Tokenize ``c`` and return the projected encoder states.
+
+        Args:
+            c: Text input passed straight to the tokenizer.
+
+        Returns:
+            The ``'projection_state'`` entry of ``forward``'s result. Inputs
+            are truncated and padded to a fixed length of 77 tokens.
+        """
+        device = next(self.parameters()).device
+        text = self.tokenizer(
+            c,
+            truncation=True,
+            max_length=77,
+            return_length=False,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        # return_tensors="pt" already yields tensors, so move them directly.
+        # Re-wrapping them in torch.tensor() makes a needless copy and triggers
+        # PyTorch's copy-construct UserWarning.
+        text["input_ids"] = text["input_ids"].to(device)
+        text["attention_mask"] = text["attention_mask"].to(device)
+        features = self(**text)
+        return features['projection_state']
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ):
+        """Run the encoder and project its output.
+
+        All tensor arguments and ``output_attentions`` are forwarded unchanged
+        to ``self.roberta``. ``return_dict`` and ``output_hidden_states`` are
+        kept for signature compatibility only: the encoder is always asked for
+        hidden states and for attribute-style outputs, and this method always
+        returns a plain dict.
+
+        Returns:
+            dict with keys:
+                ``pooler_output``: projection of the layer-normalized state
+                    selected by ``self.pooler``.
+                ``last_hidden_state``, ``hidden_states``, ``attentions``: taken
+                    as-is from the encoder output.
+                ``projection_state``: projection of the encoder's
+                    ``last_hidden_state`` (no layer norm applied).
+                ``sequence_out``: first element of the encoder output.
+        """
+        # The code below reads encoder outputs by attribute name, which only
+        # works on the dict-style output. Passing return_dict=False through (as
+        # the caller or the config could request) would yield a tuple and fail
+        # with AttributeError, so the inner call is pinned to return_dict=True.
+        outputs = self.roberta(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=True,
+        )
+
+        # last module outputs
+        sequence_output = outputs[0]
+
+        # layer norm is applied only on the pooled path
+        sequence_output_ln = self.pre_LN(sequence_output)
+
+        # pooler
+        pooler_output = self.pooler(sequence_output_ln)
+        pooler_output = self.transformation(pooler_output)
+        projection_state = self.transformation(outputs.last_hidden_state)
+
+        return {
+            'pooler_output': pooler_output,
+            'last_hidden_state': outputs.last_hidden_state,
+            'hidden_states': outputs.hidden_states,
+            'attentions': outputs.attentions,
+            'projection_state': projection_state,
+            'sequence_out': sequence_output,
+        }
+
+
+class RobertaSeriesModelWithTransformation(BertSeriesModelWithTransformation):
+    """BertSeriesModelWithTransformation variant that uses RobertaSeriesConfig."""
+
+    config_class = RobertaSeriesConfig
+    # Same name as the attribute the parent class stores its XLMRobertaModel under.
+    base_model_prefix = 'roberta'