Merge branch 'dev' into extra-networks-performance-updates

2026-02-03 22:31:42 -08:00 · 2024-06-21 09:45:17 -04:00 · 2024-06-21 09:45:17 -04:00 · e147b579ca
commit e147b579ca
parent 11ed5f6ecb a30b19dd55
61 changed files with 765 additions and 325 deletions
--- a/modules/api/api.py
+++ b/modules/api/api.py
@ -438,15 +438,19 @@ class Api:
        self.apply_infotext(txt2imgreq, "txt2img", script_runner=script_runner, mentioned_script_args=infotext_script_args)

        selectable_scripts, selectable_script_idx = self.get_selectable_script(txt2imgreq.script_name, script_runner)
+        sampler, scheduler = sd_samplers.get_sampler_and_scheduler(txt2imgreq.sampler_name or txt2imgreq.sampler_index, txt2imgreq.scheduler)

        populate = txt2imgreq.copy(update={  # Override __init__ params
-            "sampler_name": validate_sampler_name(txt2imgreq.sampler_name or txt2imgreq.sampler_index),
+            "sampler_name": validate_sampler_name(sampler),
            "do_not_save_samples": not txt2imgreq.save_images,
            "do_not_save_grid": not txt2imgreq.save_images,
        })
        if populate.sampler_name:
            populate.sampler_index = None  # prevent a warning later on

+        if not populate.scheduler and scheduler != "Automatic":
+            populate.scheduler = scheduler
+
        args = vars(populate)
        args.pop('script_name', None)
        args.pop('script_args', None) # will refeed them to the pipeline directly after initializing them
@ -502,9 +506,10 @@ class Api:
        self.apply_infotext(img2imgreq, "img2img", script_runner=script_runner, mentioned_script_args=infotext_script_args)

        selectable_scripts, selectable_script_idx = self.get_selectable_script(img2imgreq.script_name, script_runner)
+        sampler, scheduler = sd_samplers.get_sampler_and_scheduler(img2imgreq.sampler_name or img2imgreq.sampler_index, img2imgreq.scheduler)

        populate = img2imgreq.copy(update={  # Override __init__ params
-            "sampler_name": validate_sampler_name(img2imgreq.sampler_name or img2imgreq.sampler_index),
+            "sampler_name": validate_sampler_name(sampler),
            "do_not_save_samples": not img2imgreq.save_images,
            "do_not_save_grid": not img2imgreq.save_images,
            "mask": mask,
@ -512,6 +517,9 @@ class Api:
        if populate.sampler_name:
            populate.sampler_index = None  # prevent a warning later on

+        if not populate.scheduler and scheduler != "Automatic":
+            populate.scheduler = scheduler
+
        args = vars(populate)
        args.pop('include_init_images', None)  # this is meant to be done by "exclude": True in model, but it's for a reason that I cannot determine.
        args.pop('script_name', None)
--- a/modules/call_queue.py
+++ b/modules/call_queue.py
@ -1,8 +1,9 @@
+import os.path
 from functools import wraps
 import html
 import time

-from modules import shared, progress, errors, devices, fifo_lock
+from modules import shared, progress, errors, devices, fifo_lock, profiling

 queue_lock = fifo_lock.FIFOLock()

@ -111,8 +112,13 @@ def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
        else:
            vram_html = ''

+        if shared.opts.profiling_enable and os.path.exists(shared.opts.profiling_filename):
+            profiling_html = f"<p class='profile'> [ <a href='{profiling.webpath()}' download>Profile</a> ] </p>"
+        else:
+            profiling_html = ''
+
        # last item is always HTML
-        res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr><span class='measurement'>{elapsed_text}</span></p>{vram_html}</div>"
+        res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr><span class='measurement'>{elapsed_text}</span></p>{vram_html}{profiling_html}</div>"

        return tuple(res)

--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@ -20,6 +20,7 @@ parser.add_argument("--dump-sysinfo", action='store_true', help="launch.py argum
 parser.add_argument("--loglevel", type=str, help="log level; one of: CRITICAL, ERROR, WARNING, INFO, DEBUG", default=None)
 parser.add_argument("--do-not-download-clip", action='store_true', help="do not download CLIP model even if it's not included in the checkpoint")
 parser.add_argument("--data-dir", type=normalized_filepath, default=os.path.dirname(os.path.dirname(os.path.realpath(__file__))), help="base path where all user data is stored")
+parser.add_argument("--models-dir", type=normalized_filepath, default=None, help="base path where models are stored; overrides --data-dir")
 parser.add_argument("--config", type=normalized_filepath, default=sd_default_config, help="path to config which constructs model",)
 parser.add_argument("--ckpt", type=normalized_filepath, default=sd_model_file, help="path to checkpoint of stable diffusion model; if specified, this checkpoint will be added to the list of checkpoints and loaded",)
 parser.add_argument("--ckpt-dir", type=normalized_filepath, default=None, help="Path to directory with stable diffusion checkpoints")
@ -41,7 +42,7 @@ parser.add_argument("--lowvram", action='store_true', help="enable stable diffus
 parser.add_argument("--lowram", action='store_true', help="load stable diffusion checkpoint weights to VRAM instead of RAM")
 parser.add_argument("--always-batch-cond-uncond", action='store_true', help="does not do anything")
 parser.add_argument("--unload-gfpgan", action='store_true', help="does not do anything.")
-parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast")
+parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "half", "autocast"], default="autocast")
 parser.add_argument("--upcast-sampling", action='store_true', help="upcast sampling. No effect with --no-half. Usually produces similar results to --no-half with better performance while using less memory.")
 parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site")
 parser.add_argument("--ngrok", type=str, help="ngrok authtoken, alternative to gradio --share", default=None)
--- a/modules/devices.py
+++ b/modules/devices.py
@ -114,6 +114,9 @@ errors.run(enable_tf32, "Enabling TF32")

 cpu: torch.device = torch.device("cpu")
 fp8: bool = False
+# Force fp16 for all models in inference. No casting during inference.
+# This flag is controlled by "--precision half" command line arg.
+force_fp16: bool = False
 device: torch.device = None
 device_interrogate: torch.device = None
 device_gfpgan: torch.device = None
@ -127,6 +130,8 @@ unet_needs_upcast = False


 def cond_cast_unet(input):
+    if force_fp16:
+        return input.to(torch.float16)
    return input.to(dtype_unet) if unet_needs_upcast else input


@ -206,6 +211,11 @@ def autocast(disable=False):
    if disable:
        return contextlib.nullcontext()

+    if force_fp16:
+        # No casting during inference if force_fp16 is enabled.
+        # All tensor dtype conversion happens before inference.
+        return contextlib.nullcontext()
+
    if fp8 and device==cpu:
        return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)

@ -233,22 +243,22 @@ def test_for_nans(x, where):
    if shared.cmd_opts.disable_nan_check:
        return

-    if not torch.all(torch.isnan(x)).item():
+    if not torch.isnan(x[(0, ) * len(x.shape)]):
        return

    if where == "unet":
-        message = "A tensor with all NaNs was produced in Unet."
+        message = "A tensor with NaNs was produced in Unet."

        if not shared.cmd_opts.no_half:
            message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."

    elif where == "vae":
-        message = "A tensor with all NaNs was produced in VAE."
+        message = "A tensor with NaNs was produced in VAE."

        if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:
            message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
    else:
-        message = "A tensor with all NaNs was produced."
+        message = "A tensor with NaNs was produced."

    message += " Use --disable-nan-check commandline argument to disable this check."

@ -269,3 +279,17 @@ def first_time_calculation():
    x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
    conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
    conv2d(x)
+
+
+def force_model_fp16():
+    """
+    ldm and sgm has modules.diffusionmodules.util.GroupNorm32.forward, which
+    force conversion of input to float32. If force_fp16 is enabled, we need to
+    prevent this casting.
+    """
+    assert force_fp16
+    import sgm.modules.diffusionmodules.util as sgm_util
+    import ldm.modules.diffusionmodules.util as ldm_util
+    sgm_util.GroupNorm32 = torch.nn.GroupNorm
+    ldm_util.GroupNorm32 = torch.nn.GroupNorm
+    print("ldm/sgm GroupNorm32 replaced with normal torch.nn.GroupNorm due to `--precision half`.")
--- a/modules/extensions.py
+++ b/modules/extensions.py
@ -191,8 +191,9 @@ class Extension:

    def check_updates(self):
        repo = Repo(self.path)
+        branch_name = f'{repo.remote().name}/{self.branch}'
        for fetch in repo.remote().fetch(dry_run=True):
-            if self.branch and fetch.name != f'{repo.remote().name}/{self.branch}':
+            if self.branch and fetch.name != branch_name:
                continue
            if fetch.flags != fetch.HEAD_UPTODATE:
                self.can_update = True
@ -200,7 +201,7 @@ class Extension:
                return

        try:
-            origin = repo.rev_parse('origin')
+            origin = repo.rev_parse(branch_name)
            if repo.head.commit != origin:
                self.can_update = True
                self.status = "behind HEAD"
@ -213,8 +214,10 @@ class Extension:
        self.can_update = False
        self.status = "latest"

-    def fetch_and_reset_hard(self, commit='origin'):
+    def fetch_and_reset_hard(self, commit=None):
        repo = Repo(self.path)
+        if commit is None:
+            commit = f'{repo.remote().name}/{self.branch}'
        # Fix: `error: Your local changes to the following files would be overwritten by merge`,
        # because WSL2 Docker set 755 file permissions instead of 644, this results to the error.
        repo.git.fetch(all=True)
--- a/modules/images.py
+++ b/modules/images.py
@ -54,11 +54,14 @@ def image_grid(imgs, batch_size=1, rows=None):
    params = script_callbacks.ImageGridLoopParams(imgs, cols, rows)
    script_callbacks.image_grid_callback(params)

-    w, h = imgs[0].size
-    grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color='black')
+    w, h = map(max, zip(*(img.size for img in imgs)))
+    grid_background_color = ImageColor.getcolor(opts.grid_background_color, 'RGB')
+    grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color=grid_background_color)

    for i, img in enumerate(params.imgs):
-        grid.paste(img, box=(i % params.cols * w, i // params.cols * h))
+        img_w, img_h = img.size
+        w_offset, h_offset = 0 if img_w == w else (w - img_w) // 2, 0 if img_h == h else (h - img_h) // 2
+        grid.paste(img, box=(i % params.cols * w + w_offset, i // params.cols * h + h_offset))

    return grid

@ -377,6 +380,7 @@ def get_sampler_scheduler(p, sampler):

 class FilenameGenerator:
    replacements = {
+        'basename': lambda self: self.basename or 'img',
        'seed': lambda self: self.seed if self.seed is not None else '',
        'seed_first': lambda self: self.seed if self.p.batch_size == 1 else self.p.all_seeds[0],
        'seed_last': lambda self: NOTHING_AND_SKIP_PREVIOUS_TEXT if self.p.batch_size == 1 else self.p.all_seeds[-1],
@ -413,12 +417,13 @@ class FilenameGenerator:
    }
    default_time_format = '%Y%m%d%H%M%S'

-    def __init__(self, p, seed, prompt, image, zip=False):
+    def __init__(self, p, seed, prompt, image, zip=False, basename=""):
        self.p = p
        self.seed = seed
        self.prompt = prompt
        self.image = image
        self.zip = zip
+        self.basename = basename

    def get_vae_filename(self):
        """Get the name of the VAE file."""
@ -606,9 +611,10 @@ def save_image_with_geninfo(image, geninfo, filename, extension=None, existing_p
                    piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(geninfo or "", encoding="unicode")
                },
            })
+        else:
+            exif_bytes = None

-
-        image.save(filename,format=image_format, exif=exif_bytes)
+        image.save(filename,format=image_format, quality=opts.jpeg_quality, exif=exif_bytes)
    elif extension.lower() == ".gif":
        image.save(filename, format=image_format, comment=geninfo)
    else:
@ -648,12 +654,12 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
        txt_fullfn (`str` or None):
            If a text file is saved for this image, this will be its full path. Otherwise None.
    """
-    namegen = FilenameGenerator(p, seed, prompt, image)
+    namegen = FilenameGenerator(p, seed, prompt, image, basename=basename)

    # WebP and JPG formats have maximum dimension limits of 16383 and 65535 respectively. switch to PNG which has a much higher limit
    if (image.height > 65535 or image.width > 65535) and extension.lower() in ("jpg", "jpeg") or (image.height > 16383 or image.width > 16383) and extension.lower() == "webp":
        print('Image dimensions too large; saving as PNG')
-        extension = ".png"
+        extension = "png"

    if save_to_dirs is None:
        save_to_dirs = (grid and opts.grid_save_to_dirs) or (not grid and opts.save_to_dirs and not no_prompt)
@ -789,7 +795,10 @@ def read_info_from_image(image: Image.Image) -> tuple[str | None, dict]:
        if exif_comment:
            geninfo = exif_comment
    elif "comment" in items: # for gif
-        geninfo = items["comment"].decode('utf8', errors="ignore")
+        if isinstance(items["comment"], bytes):
+            geninfo = items["comment"].decode('utf8', errors="ignore")
+        else:
+            geninfo = items["comment"]

    for field in IGNORED_INFO_KEYS:
        items.pop(field, None)
--- a/modules/img2img.py
+++ b/modules/img2img.py
@ -17,11 +17,14 @@ from modules.ui import plaintext_to_html
 import modules.scripts


-def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=False, scale_by=1.0, use_png_info=False, png_info_props=None, png_info_dir=None):
+def process_batch(p, input, output_dir, inpaint_mask_dir, args, to_scale=False, scale_by=1.0, use_png_info=False, png_info_props=None, png_info_dir=None):
    output_dir = output_dir.strip()
    processing.fix_seed(p)

-    batch_images = list(shared.walk_files(input_dir, allowed_extensions=(".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff")))
+    if isinstance(input, str):
+        batch_images = list(shared.walk_files(input, allowed_extensions=(".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff")))
+    else:
+        batch_images = [os.path.abspath(x.name) for x in input]

    is_inpaint_batch = False
    if inpaint_mask_dir:
@ -146,7 +149,7 @@ def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=Fal
    return batch_results


-def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_prompt: str, prompt_styles, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, mask_blur: int, mask_alpha: float, inpainting_fill: int, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, selected_scale_tab: int, height: int, width: int, scale_by: float, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, img2img_batch_use_png_info: bool, img2img_batch_png_info_props: list, img2img_batch_png_info_dir: str, *args):
+def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_prompt: str, prompt_styles, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, mask_blur: int, mask_alpha: float, inpainting_fill: int, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, selected_scale_tab: int, height: int, width: int, scale_by: float, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, img2img_batch_use_png_info: bool, img2img_batch_png_info_props: list, img2img_batch_png_info_dir: str, img2img_batch_source_type: str, img2img_batch_upload: list, *args):
    override_settings = create_override_settings_dict(override_settings_texts)

    is_batch = mode == 5
@ -221,8 +224,15 @@ def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_

    with closing(p):
        if is_batch:
-            assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
-            processed = process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=img2img_batch_png_info_dir)
+            if img2img_batch_source_type == "upload":
+                assert isinstance(img2img_batch_upload, list) and img2img_batch_upload
+                output_dir = ""
+                inpaint_mask_dir = ""
+                png_info_dir = img2img_batch_png_info_dir if not shared.cmd_opts.hide_ui_dir_config else ""
+                processed = process_batch(p, img2img_batch_upload, output_dir, inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=png_info_dir)
+            else: # "from dir"
+                assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
+                processed = process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=img2img_batch_png_info_dir)

            if processed is None:
                processed = Processed(p, [], p.seed, "")
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@ -76,7 +76,7 @@ def git_tag():
    except Exception:
        try:

-            changelog_md = os.path.join(os.path.dirname(os.path.dirname(__file__)), "CHANGELOG.md")
+            changelog_md = os.path.join(script_path, "CHANGELOG.md")
            with open(changelog_md, "r", encoding="utf-8") as file:
                line = next((line.strip() for line in file if line.strip()), "<none>")
                line = line.replace("## ", "")
@ -231,7 +231,7 @@ def run_extension_installer(extension_dir):

    try:
        env = os.environ.copy()
-        env['PYTHONPATH'] = f"{os.path.abspath('.')}{os.pathsep}{env.get('PYTHONPATH', '')}"
+        env['PYTHONPATH'] = f"{script_path}{os.pathsep}{env.get('PYTHONPATH', '')}"

        stdout = run(f'"{python}" "{path_installer}"', errdesc=f"Error running install.py for extension {extension_dir}", custom_env=env).strip()
        if stdout:
--- a/modules/modelloader.py
+++ b/modules/modelloader.py
@ -23,6 +23,7 @@ def load_file_from_url(
    model_dir: str,
    progress: bool = True,
    file_name: str | None = None,
+    hash_prefix: str | None = None,
 ) -> str:
    """Download a file from `url` into `model_dir`, using the file present if possible.

@ -36,11 +37,11 @@ def load_file_from_url(
    if not os.path.exists(cached_file):
        print(f'Downloading: "{url}" to {cached_file}\n')
        from torch.hub import download_url_to_file
-        download_url_to_file(url, cached_file, progress=progress)
+        download_url_to_file(url, cached_file, progress=progress, hash_prefix=hash_prefix)
    return cached_file


-def load_models(model_path: str, model_url: str = None, command_path: str = None, ext_filter=None, download_name=None, ext_blacklist=None) -> list:
+def load_models(model_path: str, model_url: str = None, command_path: str = None, ext_filter=None, download_name=None, ext_blacklist=None, hash_prefix=None) -> list:
    """
    A one-and done loader to try finding the desired models in specified directories.

@ -49,6 +50,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None
    @param model_path: The location to store/find models in.
    @param command_path: A command-line argument to search for models in first.
    @param ext_filter: An optional list of filename extensions to filter by
+    @param hash_prefix: the expected sha256 of the model_url
    @return: A list of paths containing the desired model(s)
    """
    output = []
@ -78,7 +80,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None

        if model_url is not None and len(output) == 0:
            if download_name is not None:
-                output.append(load_file_from_url(model_url, model_dir=places[0], file_name=download_name))
+                output.append(load_file_from_url(model_url, model_dir=places[0], file_name=download_name, hash_prefix=hash_prefix))
            else:
                output.append(model_url)

--- a/modules/paths_internal.py
+++ b/modules/paths_internal.py
@ -24,11 +24,12 @@ default_sd_model_file = sd_model_file
 # Parse the --data-dir flag first so we can use it as a base for our other argument default values
 parser_pre = argparse.ArgumentParser(add_help=False)
 parser_pre.add_argument("--data-dir", type=str, default=os.path.dirname(modules_path), help="base path where all user data is stored", )
+parser_pre.add_argument("--models-dir", type=str, default=None, help="base path where models are stored; overrides --data-dir", )
 cmd_opts_pre = parser_pre.parse_known_args()[0]

 data_path = cmd_opts_pre.data_dir

-models_path = os.path.join(data_path, "models")
+models_path = cmd_opts_pre.models_dir if cmd_opts_pre.models_dir else os.path.join(data_path, "models")
 extensions_dir = os.path.join(data_path, "extensions")
 extensions_builtin_dir = os.path.join(script_path, "extensions-builtin")
 config_states_dir = os.path.join(script_path, "config_states")
--- a/modules/postprocessing.py
+++ b/modules/postprocessing.py
@ -62,11 +62,13 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir,
        else:
            image_data = image_placeholder

+        image_data = image_data if image_data.mode in ("RGBA", "RGB") else image_data.convert("RGB")
+
        parameters, existing_pnginfo = images.read_info_from_image(image_data)
        if parameters:
            existing_pnginfo["parameters"] = parameters

-        initial_pp = scripts_postprocessing.PostprocessedImage(image_data if image_data.mode in ("RGBA", "RGB") else image_data.convert("RGB"))
+        initial_pp = scripts_postprocessing.PostprocessedImage(image_data)

        scripts.scripts_postproc.run(initial_pp, args)

--- a/modules/processing.py
+++ b/modules/processing.py
@ -16,7 +16,7 @@ from skimage import exposure
 from typing import Any

 import modules.sd_hijack
-from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext_utils, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng
+from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext_utils, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng, profiling
 from modules.rng import slerp # noqa: F401
 from modules.sd_hijack import model_hijack
 from modules.sd_samplers_common import images_tensor_to_samples, decode_first_stage, approximation_indexes
@ -115,20 +115,17 @@ def txt2img_image_conditioning(sd_model, x, width, height):
        return x.new_zeros(x.shape[0], 2*sd_model.noise_augmentor.time_embed.dim, dtype=x.dtype, device=x.device)

    else:
-        sd = sd_model.model.state_dict()
-        diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
-        if diffusion_model_input is not None:
-            if diffusion_model_input.shape[1] == 9:
-                # The "masked-image" in this case will just be all 0.5 since the entire image is masked.
-                image_conditioning = torch.ones(x.shape[0], 3, height, width, device=x.device) * 0.5
-                image_conditioning = images_tensor_to_samples(image_conditioning,
-                                                              approximation_indexes.get(opts.sd_vae_encode_method))
+        if sd_model.is_sdxl_inpaint:
+            # The "masked-image" in this case will just be all 0.5 since the entire image is masked.
+            image_conditioning = torch.ones(x.shape[0], 3, height, width, device=x.device) * 0.5
+            image_conditioning = images_tensor_to_samples(image_conditioning,
+                                                            approximation_indexes.get(opts.sd_vae_encode_method))

-                # Add the fake full 1s mask to the first dimension.
-                image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
-                image_conditioning = image_conditioning.to(x.dtype)
+            # Add the fake full 1s mask to the first dimension.
+            image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
+            image_conditioning = image_conditioning.to(x.dtype)

-                return image_conditioning
+            return image_conditioning

        # Dummy zero conditioning if we're not using inpainting or unclip models.
        # Still takes up a bit of memory, but no encoder call.
@ -238,11 +235,6 @@ class StableDiffusionProcessing:
            self.styles = []

        self.sampler_noise_scheduler_override = None
-        self.s_min_uncond = self.s_min_uncond if self.s_min_uncond is not None else opts.s_min_uncond
-        self.s_churn = self.s_churn if self.s_churn is not None else opts.s_churn
-        self.s_tmin = self.s_tmin if self.s_tmin is not None else opts.s_tmin
-        self.s_tmax = (self.s_tmax if self.s_tmax is not None else opts.s_tmax) or float('inf')
-        self.s_noise = self.s_noise if self.s_noise is not None else opts.s_noise

        self.extra_generation_params = self.extra_generation_params or {}
        self.override_settings = self.override_settings or {}
@ -259,6 +251,13 @@ class StableDiffusionProcessing:
        self.cached_uc = StableDiffusionProcessing.cached_uc
        self.cached_c = StableDiffusionProcessing.cached_c

+    def fill_fields_from_opts(self):
+        self.s_min_uncond = self.s_min_uncond if self.s_min_uncond is not None else opts.s_min_uncond
+        self.s_churn = self.s_churn if self.s_churn is not None else opts.s_churn
+        self.s_tmin = self.s_tmin if self.s_tmin is not None else opts.s_tmin
+        self.s_tmax = (self.s_tmax if self.s_tmax is not None else opts.s_tmax) or float('inf')
+        self.s_noise = self.s_noise if self.s_noise is not None else opts.s_noise
+
    @property
    def sd_model(self):
        return shared.sd_model
@ -390,11 +389,8 @@ class StableDiffusionProcessing:
        if self.sampler.conditioning_key == "crossattn-adm":
            return self.unclip_image_conditioning(source_image)

-        sd = self.sampler.model_wrap.inner_model.model.state_dict()
-        diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
-        if diffusion_model_input is not None:
-            if diffusion_model_input.shape[1] == 9:
-                return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
+        if self.sampler.model_wrap.inner_model.is_sdxl_inpaint:
+            return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)

        # Dummy zero conditioning if we're not using inpainting or depth model.
        return latent_image.new_zeros(latent_image.shape[0], 5, 1, 1)
@ -569,7 +565,7 @@ class Processed:
        self.all_negative_prompts = all_negative_prompts or p.all_negative_prompts or [self.negative_prompt]
        self.all_seeds = all_seeds or p.all_seeds or [self.seed]
        self.all_subseeds = all_subseeds or p.all_subseeds or [self.subseed]
-        self.infotexts = infotexts or [info]
+        self.infotexts = infotexts or [info] * len(images_list)
        self.version = program_version()

    def js(self):
@ -629,6 +625,9 @@ class DecodedSamples(list):
 def decode_latent_batch(model, batch, target_device=None, check_for_nans=False):
    samples = DecodedSamples()

+    if check_for_nans:
+        devices.test_for_nans(batch, "unet")
+
    for i in range(batch.shape[0]):
        sample = decode_first_stage(model, batch[i:i + 1])[0]

@ -794,7 +793,6 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter
        "Token merging ratio hr": None if not enable_hr or token_merging_ratio_hr == 0 else token_merging_ratio_hr,
        "Init image hash": getattr(p, 'init_img_hash', None),
        "RNG": opts.randn_source if opts.randn_source != "GPU" else None,
-        "NGMS": None if p.s_min_uncond == 0 else p.s_min_uncond,
        "Tiling": "True" if p.tiling else None,
        **p.extra_generation_params,
        "Version": program_version() if opts.add_version_to_infotext else None,
@ -842,7 +840,11 @@ def process_images(p: StableDiffusionProcessing) -> Processed:

        sd_models.apply_token_merging(p.sd_model, p.get_token_merging_ratio())

-        res = process_images_inner(p)
+        # backwards compatibility, fix sampler and scheduler if invalid
+        sd_samplers.fix_p_invalid_sampler_and_scheduler(p)
+
+        with profiling.Profiler():
+            res = process_images_inner(p)

    finally:
        sd_models.apply_token_merging(p.sd_model, 0)
@ -890,6 +892,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
    modules.sd_hijack.model_hijack.apply_circular(p.tiling)
    modules.sd_hijack.model_hijack.clear_comments()

+    p.fill_fields_from_opts()
    p.setup_prompts()

    if isinstance(seed, list):
@ -988,6 +991,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
            if getattr(samples_ddim, 'already_decoded', False):
                x_samples_ddim = samples_ddim
            else:
+                devices.test_for_nans(samples_ddim, "unet")
+
                if opts.sd_vae_decode_method != 'Full':
                    p.extra_generation_params['VAE Decoder'] = opts.sd_vae_decode_method
                x_samples_ddim = decode_latent_batch(p.sd_model, samples_ddim, target_device=devices.cpu, check_for_nans=True)
@ -1325,6 +1330,15 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
            # here we generate an image normally

            x = self.rng.next()
+            if self.scripts is not None:
+                self.scripts.process_before_every_sampling(
+                    p=self,
+                    x=x,
+                    noise=x,
+                    c=conditioning,
+                    uc=unconditional_conditioning
+                )
+
            samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x))
            del x

@ -1425,6 +1439,13 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):

        if self.scripts is not None:
            self.scripts.before_hr(self)
+            self.scripts.process_before_every_sampling(
+                p=self,
+                x=samples,
+                noise=noise,
+                c=self.hr_c,
+                uc=self.hr_uc,
+            )

        samples = self.sampler.sample_img2img(self, samples, noise, self.hr_c, self.hr_uc, steps=self.hr_second_pass_steps or self.steps, image_conditioning=image_conditioning)

@ -1738,6 +1759,14 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
            self.extra_generation_params["Noise multiplier"] = self.initial_noise_multiplier
            x *= self.initial_noise_multiplier

+        if self.scripts is not None:
+            self.scripts.process_before_every_sampling(
+                p=self,
+                x=self.init_latent,
+                noise=x,
+                c=conditioning,
+                uc=unconditional_conditioning
+            )
        samples = self.sampler.sample_img2img(self, self.init_latent, x, conditioning, unconditional_conditioning, image_conditioning=self.image_conditioning)

        if self.mask is not None:
--- a/modules/profiling.py
+++ b/modules/profiling.py
@ -0,0 +1,46 @@
+import torch
+
+from modules import shared, ui_gradio_extensions
+
+
+class Profiler:
+    def __init__(self):
+        if not shared.opts.profiling_enable:
+            self.profiler = None
+            return
+
+        activities = []
+        if "CPU" in shared.opts.profiling_activities:
+            activities.append(torch.profiler.ProfilerActivity.CPU)
+        if "CUDA" in shared.opts.profiling_activities:
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+
+        if not activities:
+            self.profiler = None
+            return
+
+        self.profiler = torch.profiler.profile(
+            activities=activities,
+            record_shapes=shared.opts.profiling_record_shapes,
+            profile_memory=shared.opts.profiling_profile_memory,
+            with_stack=shared.opts.profiling_with_stack
+        )
+
+    def __enter__(self):
+        if self.profiler:
+            self.profiler.__enter__()
+
+        return self
+
+    def __exit__(self, exc_type, exc, exc_tb):
+        if self.profiler:
+            shared.state.textinfo = "Finishing profile..."
+
+            self.profiler.__exit__(exc_type, exc, exc_tb)
+
+            self.profiler.export_chrome_trace(shared.opts.profiling_filename)
+
+
+def webpath():
+    return ui_gradio_extensions.webpath(shared.opts.profiling_filename)
+
--- a/modules/safe.py
+++ b/modules/safe.py
@ -64,8 +64,8 @@ class RestrictedUnpickler(pickle.Unpickler):
        raise Exception(f"global '{module}/{name}' is forbidden")


-# Regular expression that accepts 'dirname/version', 'dirname/data.pkl', and 'dirname/data/<number>'
-allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|(data\.pkl))$")
+# Regular expression that accepts 'dirname/version', 'dirname/byteorder', 'dirname/data.pkl', '.data/serialization_id', and 'dirname/data/<number>'
+allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|byteorder|.data/serialization_id|(data\.pkl))$")
 data_pkl_re = re.compile(r"^([^/]+)/data\.pkl$")

 def check_zip_filenames(filename, names):
--- a/modules/scripts.py
+++ b/modules/scripts.py
@ -187,6 +187,13 @@ class Script:
        """
        pass

+    def process_before_every_sampling(self, p, *args, **kwargs):
+        """
+        Similar to process(), called before every sampling.
+        If you use high-res fix, this will be called two times.
+        """
+        pass
+
    def process_batch(self, p, *args, **kwargs):
        """
        Same as process(), but called for every batch.
@ -826,6 +833,14 @@ class ScriptRunner:
            except Exception:
                errors.report(f"Error running process: {script.filename}", exc_info=True)

+    def process_before_every_sampling(self, p, **kwargs):
+        for script in self.ordered_scripts('process_before_every_sampling'):
+            try:
+                script_args = p.script_args[script.args_from:script.args_to]
+                script.process_before_every_sampling(p, *script_args, **kwargs)
+            except Exception:
+                errors.report(f"Error running process_before_every_sampling: {script.filename}", exc_info=True)
+
    def before_process_batch(self, p, **kwargs):
        for script in self.ordered_scripts('before_process_batch'):
            try:
--- a/modules/sd_hijack_clip.py
+++ b/modules/sd_hijack_clip.py
@ -353,7 +353,9 @@ class FrozenCLIPEmbedderForSDXLWithCustomWords(FrozenCLIPEmbedderWithCustomWords
    def encode_with_transformers(self, tokens):
        outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=self.wrapped.layer == "hidden")

-        if self.wrapped.layer == "last":
+        if opts.sdxl_clip_l_skip is True:
+            z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
+        elif self.wrapped.layer == "last":
            z = outputs.last_hidden_state
        else:
            z = outputs.hidden_states[self.wrapped.layer_idx]
--- a/modules/sd_hijack_optimizations.py
+++ b/modules/sd_hijack_optimizations.py
@ -486,7 +486,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs):
    k_in = self.to_k(context_k)
    v_in = self.to_v(context_v)

-    q, k, v = (rearrange(t, 'b n (h d) -> b n h d', h=h) for t in (q_in, k_in, v_in))
+    q, k, v = (t.reshape(t.shape[0], t.shape[1], h, -1) for t in (q_in, k_in, v_in))
+
    del q_in, k_in, v_in

    dtype = q.dtype
@ -497,7 +498,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs):

    out = out.to(dtype)

-    out = rearrange(out, 'b n h d -> b n (h d)', h=h)
+    b, n, h, d = out.shape
+    out = out.reshape(b, n, h * d)
    return self.to_out(out)


--- a/modules/sd_hijack_unet.py
+++ b/modules/sd_hijack_unet.py
@ -1,5 +1,7 @@
 import torch
 from packaging import version
+from einops import repeat
+import math

 from modules import devices
 from modules.sd_hijack_utils import CondFunc
@ -36,7 +38,7 @@ th = TorchHijackForUnet()

 # Below are monkey patches to enable upcasting a float16 UNet for float32 sampling
 def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
-
+    """Always make sure inputs to unet are in correct dtype."""
    if isinstance(cond, dict):
        for y in cond.keys():
            if isinstance(cond[y], list):
@ -45,7 +47,59 @@ def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
                cond[y] = cond[y].to(devices.dtype_unet) if isinstance(cond[y], torch.Tensor) else cond[y]

    with devices.autocast():
-        return orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs).float()
+        result = orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs)
+        if devices.unet_needs_upcast:
+            return result.float()
+        else:
+            return result
+
+
+# Monkey patch to create timestep embed tensor on device, avoiding a block.
+def timestep_embedding(_, timesteps, dim, max_period=10000, repeat_only=False):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    if not repeat_only:
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half
+        )
+        args = timesteps[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    else:
+        embedding = repeat(timesteps, 'b -> b d', d=dim)
+    return embedding
+
+
+# Monkey patch to SpatialTransformer removing unnecessary contiguous calls.
+# Prevents a lot of unnecessary aten::copy_ calls
+def spatial_transformer_forward(_, self, x: torch.Tensor, context=None):
+    # note: if no context is given, cross-attention defaults to self-attention
+    if not isinstance(context, list):
+        context = [context]
+    b, c, h, w = x.shape
+    x_in = x
+    x = self.norm(x)
+    if not self.use_linear:
+        x = self.proj_in(x)
+    x = x.permute(0, 2, 3, 1).reshape(b, h * w, c)
+    if self.use_linear:
+        x = self.proj_in(x)
+    for i, block in enumerate(self.transformer_blocks):
+        x = block(x, context=context[i])
+    if self.use_linear:
+        x = self.proj_out(x)
+    x = x.view(b, h, w, c).permute(0, 3, 1, 2)
+    if not self.use_linear:
+        x = self.proj_out(x)
+    return x + x_in


 class GELUHijack(torch.nn.GELU, torch.nn.Module):
@ -64,12 +118,15 @@ def hijack_ddpm_edit():
    if not ddpm_edit_hijack:
        CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)
        CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
-        ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
+        ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model)


 unet_needs_upcast = lambda *args, **kwargs: devices.unet_needs_upcast
 CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
+CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding)
+CondFunc('ldm.modules.attention.SpatialTransformer.forward', spatial_transformer_forward)
 CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)
+
 if version.parse(torch.__version__) <= version.parse("1.13.2") or torch.cuda.is_available():
    CondFunc('ldm.modules.diffusionmodules.util.GroupNorm32.forward', lambda orig_func, self, *args, **kwargs: orig_func(self.float(), *args, **kwargs), unet_needs_upcast)
    CondFunc('ldm.modules.attention.GEGLU.forward', lambda orig_func, self, x: orig_func(self.float(), x.float()).to(devices.dtype_unet), unet_needs_upcast)
@ -81,5 +138,17 @@ CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.decode_first_stage', first_s
 CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
 CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding', lambda orig_func, *args, **kwargs: orig_func(*args, **kwargs).float(), first_stage_cond)

-CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model, unet_needs_upcast)
-CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)
+CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model)
+CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model)
+
+
+def timestep_embedding_cast_result(orig_func, timesteps, *args, **kwargs):
+    if devices.unet_needs_upcast and timesteps.dtype == torch.int64:
+        dtype = torch.float32
+    else:
+        dtype = devices.dtype_unet
+    return orig_func(timesteps, *args, **kwargs).to(dtype=dtype)
+
+
+CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
+CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
--- a/modules/sd_hijack_utils.py
+++ b/modules/sd_hijack_utils.py
@ -1,7 +1,11 @@
 import importlib

+
+always_true_func = lambda *args, **kwargs: True
+
+
 class CondFunc:
-    def __new__(cls, orig_func, sub_func, cond_func):
+    def __new__(cls, orig_func, sub_func, cond_func=always_true_func):
        self = super(CondFunc, cls).__new__(cls)
        if isinstance(orig_func, str):
            func_path = orig_func.split('.')
@ -20,13 +24,13 @@ class CondFunc:
                print(f"Warning: Failed to resolve {orig_func} for CondFunc hijack")
                pass
        self.__init__(orig_func, sub_func, cond_func)
-        return lambda *args, **kwargs: self(*args, **kwargs)
-    def __init__(self, orig_func, sub_func, cond_func):
-        self.__orig_func = orig_func
-        self.__sub_func = sub_func
-        self.__cond_func = cond_func
-    def __call__(self, *args, **kwargs):
-        if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
-            return self.__sub_func(self.__orig_func, *args, **kwargs)
-        else:
-            return self.__orig_func(*args, **kwargs)
+        return lambda *args, **kwargs: self(*args, **kwargs)
+    def __init__(self, orig_func, sub_func, cond_func):
+        self.__orig_func = orig_func
+        self.__sub_func = sub_func
+        self.__cond_func = cond_func
+    def __call__(self, *args, **kwargs):
+        if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
+            return self.__sub_func(self.__orig_func, *args, **kwargs)
+        else:
+            return self.__orig_func(*args, **kwargs)
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@ -149,10 +149,12 @@ def list_models():
    cmd_ckpt = shared.cmd_opts.ckpt
    if shared.cmd_opts.no_download_sd_model or cmd_ckpt != shared.sd_model_file or os.path.exists(cmd_ckpt):
        model_url = None
+        expected_sha256 = None
    else:
        model_url = f"{shared.hf_endpoint}/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors"
+        expected_sha256 = '6ce0161689b3853acaa03779ec93eafe75a02f4ced659bee03f50797806fa2fa'

-    model_list = modelloader.load_models(model_path=model_path, model_url=model_url, command_path=shared.cmd_opts.ckpt_dir, ext_filter=[".ckpt", ".safetensors"], download_name="v1-5-pruned-emaonly.safetensors", ext_blacklist=[".vae.ckpt", ".vae.safetensors"])
+    model_list = modelloader.load_models(model_path=model_path, model_url=model_url, command_path=shared.cmd_opts.ckpt_dir, ext_filter=[".ckpt", ".safetensors"], download_name="v1-5-pruned-emaonly.safetensors", ext_blacklist=[".vae.ckpt", ".vae.safetensors"], hash_prefix=expected_sha256)

    if os.path.exists(cmd_ckpt):
        checkpoint_info = CheckpointInfo(cmd_ckpt)
@ -280,17 +282,21 @@ def read_metadata_from_safetensors(filename):
        json_start = file.read(2)

        assert metadata_len > 2 and json_start in (b'{"', b"{'"), f"{filename} is not a safetensors file"
-        json_data = json_start + file.read(metadata_len-2)
-        json_obj = json.loads(json_data)

        res = {}
-        for k, v in json_obj.get("__metadata__", {}).items():
-            res[k] = v
-            if isinstance(v, str) and v[0:1] == '{':
-                try:
-                    res[k] = json.loads(v)
-                except Exception:
-                    pass
+
+        try:
+            json_data = json_start + file.read(metadata_len-2)
+            json_obj = json.loads(json_data)
+            for k, v in json_obj.get("__metadata__", {}).items():
+                res[k] = v
+                if isinstance(v, str) and v[0:1] == '{':
+                    try:
+                        res[k] = json.loads(v)
+                    except Exception:
+                        pass
+        except Exception:
+             errors.report(f"Error reading metadata from file: {filename}", exc_info=True)

        return res

@ -395,6 +401,19 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer

    del state_dict

+    # Set is_sdxl_inpaint flag.
+    # Checks Unet structure to detect inpaint model. The inpaint model's
+    # checkpoint state_dict does not contain the key
+    # 'diffusion_model.input_blocks.0.0.weight'.
+    diffusion_model_input = model.model.state_dict().get(
+        'diffusion_model.input_blocks.0.0.weight'
+    )
+    model.is_sdxl_inpaint = (
+        model.is_sdxl and
+        diffusion_model_input is not None and
+        diffusion_model_input.shape[1] == 9
+    )
+
    if shared.cmd_opts.opt_channelslast:
        model.to(memory_format=torch.channels_last)
        timer.record("apply channels_last")
@ -403,6 +422,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
        model.float()
        model.alphas_cumprod_original = model.alphas_cumprod
        devices.dtype_unet = torch.float32
+        assert shared.cmd_opts.precision != "half", "Cannot use --precision half with --no-half"
        timer.record("apply float()")
    else:
        vae = model.first_stage_model
@ -540,7 +560,7 @@ def repair_config(sd_config):
    if hasattr(sd_config.model.params, 'unet_config'):
        if shared.cmd_opts.no_half:
            sd_config.model.params.unet_config.params.use_fp16 = False
-        elif shared.cmd_opts.upcast_sampling:
+        elif shared.cmd_opts.upcast_sampling or shared.cmd_opts.precision == "half":
            sd_config.model.params.unet_config.params.use_fp16 = True

    if getattr(sd_config.model.params.first_stage_config.params.ddconfig, "attn_type", None) == "vanilla-xformers" and not shared.xformers_available:
@ -551,6 +571,14 @@ def repair_config(sd_config):
        karlo_path = os.path.join(paths.models_path, 'karlo')
        sd_config.model.params.noise_aug_config.params.clip_stats_path = sd_config.model.params.noise_aug_config.params.clip_stats_path.replace("checkpoints/karlo_models", karlo_path)

+    # Do not use checkpoint for inference.
+    # This helps prevent extra performance overhead on checking parameters.
+    # The perf overhead is about 100ms/it on 4090 for SDXL.
+    if hasattr(sd_config.model.params, "network_config"):
+        sd_config.model.params.network_config.params.use_checkpoint = False
+    if hasattr(sd_config.model.params, "unet_config"):
+        sd_config.model.params.unet_config.params.use_checkpoint = False
+

 def rescale_zero_terminal_snr_abar(alphas_cumprod):
    alphas_bar_sqrt = alphas_cumprod.sqrt()
@ -659,10 +687,11 @@ def get_empty_cond(sd_model):


 def send_model_to_cpu(m):
-    if m.lowvram:
-        lowvram.send_everything_to_cpu()
-    else:
-        m.to(devices.cpu)
+    if m is not None:
+        if m.lowvram:
+            lowvram.send_everything_to_cpu()
+        else:
+            m.to(devices.cpu)

    devices.torch_gc()

--- a/modules/sd_models_config.py
+++ b/modules/sd_models_config.py
@ -35,7 +35,7 @@ def is_using_v_parameterization_for_sd2(state_dict):

    with sd_disable_initialization.DisableInitialization():
        unet = ldm.modules.diffusionmodules.openaimodel.UNetModel(
-            use_checkpoint=True,
+            use_checkpoint=False,
            use_fp16=False,
            image_size=32,
            in_channels=4,
--- a/modules/sd_models_xl.py
+++ b/modules/sd_models_xl.py
@ -35,11 +35,10 @@ def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch:


 def apply_model(self: sgm.models.diffusion.DiffusionEngine, x, t, cond):
-    sd = self.model.state_dict()
-    diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
-    if diffusion_model_input is not None:
-        if diffusion_model_input.shape[1] == 9:
-            x = torch.cat([x] + cond['c_concat'], dim=1)
+    """WARNING: This function is called once per denoising iteration. DO NOT add
+    expensive functionc calls such as `model.state_dict`. """
+    if self.is_sdxl_inpaint:
+        x = torch.cat([x] + cond['c_concat'], dim=1)

    return self.model(x, t, cond)

--- a/modules/sd_samplers.py
+++ b/modules/sd_samplers.py
@ -1,7 +1,7 @@
 from __future__ import annotations

 import functools
-
+import logging
 from modules import sd_samplers_kdiffusion, sd_samplers_timesteps, sd_samplers_lcm, shared, sd_samplers_common, sd_schedulers

 # imports for functions that previously were here and are used by other modules
@ -122,4 +122,11 @@ def get_sampler_and_scheduler(sampler_name, scheduler_name):
    return sampler.name, found_scheduler.label


+def fix_p_invalid_sampler_and_scheduler(p):
+    i_sampler_name, i_scheduler = p.sampler_name, p.scheduler
+    p.sampler_name, p.scheduler = get_sampler_and_scheduler(p.sampler_name, p.scheduler)
+    if p.sampler_name != i_sampler_name or i_scheduler != p.scheduler:
+        logging.warning(f'Sampler Scheduler autocorrection: "{i_sampler_name}" -> "{p.sampler_name}", "{i_scheduler}" -> "{p.scheduler}"')
+
+
 set_samplers()
--- a/modules/sd_samplers_cfg_denoiser.py
+++ b/modules/sd_samplers_cfg_denoiser.py
@ -1,5 +1,5 @@
 import torch
-from modules import prompt_parser, devices, sd_samplers_common
+from modules import prompt_parser, sd_samplers_common

 from modules.shared import opts, state
 import modules.shared as shared
@ -212,9 +212,16 @@ class CFGDenoiser(torch.nn.Module):
        uncond = denoiser_params.text_uncond
        skip_uncond = False

-        # alternating uncond allows for higher thresholds without the quality loss normally expected from raising it
-        if self.step % 2 and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
+        if shared.opts.skip_early_cond != 0. and self.step / self.total_steps <= shared.opts.skip_early_cond:
            skip_uncond = True
+            self.p.extra_generation_params["Skip Early CFG"] = shared.opts.skip_early_cond
+        elif (self.step % 2 or shared.opts.s_min_uncond_all) and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
+            skip_uncond = True
+            self.p.extra_generation_params["NGMS"] = s_min_uncond
+            if shared.opts.s_min_uncond_all:
+                self.p.extra_generation_params["NGMS all steps"] = shared.opts.s_min_uncond_all
+
+        if skip_uncond:
            x_in = x_in[:-batch_size]
            sigma_in = sigma_in[:-batch_size]

@ -266,8 +273,6 @@ class CFGDenoiser(torch.nn.Module):
        denoised_params = CFGDenoisedParams(x_out, state.sampling_step, state.sampling_steps, self.inner_model)
        cfg_denoised_callback(denoised_params)

-        devices.test_for_nans(x_out, "unet")
-
        if is_edit_model:
            denoised = self.combine_denoised_for_edit_model(x_out, cond_scale)
        elif skip_uncond:
--- a/modules/sd_samplers_kdiffusion.py
+++ b/modules/sd_samplers_kdiffusion.py
@ -1,7 +1,7 @@
 import torch
 import inspect
 import k_diffusion.sampling
-from modules import sd_samplers_common, sd_samplers_extra, sd_samplers_cfg_denoiser, sd_schedulers
+from modules import sd_samplers_common, sd_samplers_extra, sd_samplers_cfg_denoiser, sd_schedulers, devices
 from modules.sd_samplers_cfg_denoiser import CFGDenoiser  # noqa: F401
 from modules.script_callbacks import ExtraNoiseParams, extra_noise_callback

@ -115,7 +115,7 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
            if scheduler.need_inner_model:
                sigmas_kwargs['inner_model'] = self.model_wrap

-            sigmas = scheduler.function(n=steps, **sigmas_kwargs, device=shared.device)
+            sigmas = scheduler.function(n=steps, **sigmas_kwargs, device=devices.cpu)

        if discard_next_to_last_sigma:
            sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
--- a/modules/sd_samplers_timesteps_impl.py
+++ b/modules/sd_samplers_timesteps_impl.py
@ -5,13 +5,14 @@ import numpy as np

 from modules import shared
 from modules.models.diffusion.uni_pc import uni_pc
+from modules.torch_utils import float64


@torch.no_grad()
 def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=0.0):
    alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
    alphas = alphas_cumprod[timesteps]
-    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
+    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
    sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
    sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))

@ -43,7 +44,7 @@ def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=
 def plms(model, x, timesteps, extra_args=None, callback=None, disable=None):
    alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
    alphas = alphas_cumprod[timesteps]
-    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
+    alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
    sqrt_one_minus_alphas = torch.sqrt(1 - alphas)

    extra_args = {} if extra_args is None else extra_args
--- a/modules/sd_schedulers.py
+++ b/modules/sd_schedulers.py
@ -1,8 +1,17 @@
 import dataclasses
-
 import torch
-
 import k_diffusion
+import numpy as np
+
+from modules import shared
+
+
+def to_d(x, sigma, denoised):
+    """Converts a denoiser output to a Karras ODE derivative."""
+    return (x - denoised) / sigma
+
+
+k_diffusion.sampling.to_d = to_d


@dataclasses.dataclass
@ -17,7 +26,7 @@ class Scheduler:


 def uniform(n, sigma_min, sigma_max, inner_model, device):
-    return inner_model.get_sigmas(n)
+    return inner_model.get_sigmas(n).to(device)


 def sgm_uniform(n, sigma_min, sigma_max, inner_model, device):
@ -31,6 +40,43 @@ def sgm_uniform(n, sigma_min, sigma_max, inner_model, device):
    return torch.FloatTensor(sigs).to(device)


+def get_align_your_steps_sigmas(n, sigma_min, sigma_max, device):
+    # https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
+    def loglinear_interp(t_steps, num_steps):
+        """
+        Performs log-linear interpolation of a given array of decreasing numbers.
+        """
+        xs = np.linspace(0, 1, len(t_steps))
+        ys = np.log(t_steps[::-1])
+
+        new_xs = np.linspace(0, 1, num_steps)
+        new_ys = np.interp(new_xs, xs, ys)
+
+        interped_ys = np.exp(new_ys)[::-1].copy()
+        return interped_ys
+
+    if shared.sd_model.is_sdxl:
+        sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.029]
+    else:
+        # Default to SD 1.5 sigmas.
+        sigmas = [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.029]
+
+    if n != len(sigmas):
+        sigmas = np.append(loglinear_interp(sigmas, n), [0.0])
+    else:
+        sigmas.append(0.0)
+
+    return torch.FloatTensor(sigmas).to(device)
+
+
+def kl_optimal(n, sigma_min, sigma_max, device):
+    alpha_min = torch.arctan(torch.tensor(sigma_min, device=device))
+    alpha_max = torch.arctan(torch.tensor(sigma_max, device=device))
+    step_indices = torch.arange(n + 1, device=device)
+    sigmas = torch.tan(step_indices / n * alpha_min + (1.0 - step_indices / n) * alpha_max)
+    return sigmas
+
+
 schedulers = [
    Scheduler('automatic', 'Automatic', None),
    Scheduler('uniform', 'Uniform', uniform, need_inner_model=True),
@ -38,6 +84,8 @@ schedulers = [
    Scheduler('exponential', 'Exponential', k_diffusion.sampling.get_sigmas_exponential),
    Scheduler('polyexponential', 'Polyexponential', k_diffusion.sampling.get_sigmas_polyexponential, default_rho=1.0),
    Scheduler('sgm_uniform', 'SGM Uniform', sgm_uniform, need_inner_model=True, aliases=["SGMUniform"]),
+    Scheduler('kl_optimal', 'KL Optimal', kl_optimal),
+    Scheduler('align_your_steps', 'Align Your Steps', get_align_your_steps_sigmas),
 ]

 schedulers_map = {**{x.name: x for x in schedulers}, **{x.label: x for x in schedulers}}
--- a/modules/shared_init.py
+++ b/modules/shared_init.py
@ -31,6 +31,14 @@ def initialize():
    devices.dtype_vae = torch.float32 if cmd_opts.no_half or cmd_opts.no_half_vae else torch.float16
    devices.dtype_inference = torch.float32 if cmd_opts.precision == 'full' else devices.dtype

+    if cmd_opts.precision == "half":
+        msg = "--no-half and --no-half-vae conflict with --precision half"
+        assert devices.dtype == torch.float16, msg
+        assert devices.dtype_vae == torch.float16, msg
+        assert devices.dtype_inference == torch.float16, msg
+        devices.force_fp16 = True
+        devices.force_model_fp16()
+
    shared.device = devices.device
    shared.weight_load_location = None if cmd_opts.lowram else "cpu"

--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@ -54,7 +54,7 @@ options_templates.update(options_section(('saving-images', "Saving images/grids"
    "save_images_before_color_correction": OptionInfo(False, "Save a copy of image before applying color correction to img2img results"),
    "save_mask": OptionInfo(False, "For inpainting, save a copy of the greyscale mask"),
    "save_mask_composite": OptionInfo(False, "For inpainting, save a masked composite"),
-    "jpeg_quality": OptionInfo(80, "Quality for saved jpeg images", gr.Slider, {"minimum": 1, "maximum": 100, "step": 1}),
+    "jpeg_quality": OptionInfo(80, "Quality for saved jpeg and avif images", gr.Slider, {"minimum": 1, "maximum": 100, "step": 1}),
    "webp_lossless": OptionInfo(False, "Use lossless compression for webp images"),
    "export_for_4chan": OptionInfo(True, "Save copy of large images as JPG").info("if the file size is above the limit, or either width or height are above the limit"),
    "img_downscale_threshold": OptionInfo(4.0, "File size limit for the above option, MB", gr.Number),
@ -129,6 +129,22 @@ options_templates.update(options_section(('system', "System", "system"), {
    "dump_stacks_on_signal": OptionInfo(False, "Print stack traces before exiting the program with ctrl+c."),
 }))

+options_templates.update(options_section(('profiler', "Profiler", "system"), {
+    "profiling_explanation": OptionHTML("""
+Those settings allow you to enable torch profiler when generating pictures.
+Profiling allows you to see which code uses how much of computer's resources during generation.
+Each generation writes its own profile to one file, overwriting previous.
+The file can be viewed in <a href="chrome:tracing">Chrome</a>, or on a <a href="https://ui.perfetto.dev/">Perfetto</a> web site.
+Warning: writing profile can take a lot of time, up to 30 seconds, and the file itelf can be around 500MB in size.
+"""),
+    "profiling_enable": OptionInfo(False, "Enable profiling"),
+    "profiling_activities": OptionInfo(["CPU"], "Activities", gr.CheckboxGroup, {"choices": ["CPU", "CUDA"]}),
+    "profiling_record_shapes": OptionInfo(True, "Record shapes"),
+    "profiling_profile_memory": OptionInfo(True, "Profile memory"),
+    "profiling_with_stack": OptionInfo(True, "Include python stack"),
+    "profiling_filename": OptionInfo("trace.json", "Profile filename"),
+}))
+
 options_templates.update(options_section(('API', "API", "system"), {
    "api_enable_requests": OptionInfo(True, "Allow http:// and https:// URLs for input images in API", restrict_api=True),
    "api_forbid_local_requests": OptionInfo(True, "Forbid URLs to local resources", restrict_api=True),
@ -160,6 +176,7 @@ options_templates.update(options_section(('sd', "Stable Diffusion", "sd"), {
    "emphasis": OptionInfo("Original", "Emphasis mode", gr.Radio, lambda: {"choices": [x.name for x in sd_emphasis.options]}, infotext="Emphasis").info("makes it possible to make model to pay (more:1.1) or (less:0.9) attention to text when you use the syntax in prompt; " + sd_emphasis.get_options_descriptions()),
    "enable_batch_seeds": OptionInfo(True, "Make K-diffusion samplers produce same images in a batch as when making a single image"),
    "comma_padding_backtrack": OptionInfo(20, "Prompt word wrap length limit", gr.Slider, {"minimum": 0, "maximum": 74, "step": 1}).info("in tokens - for texts shorter than specified, if they don't fit into 75 token limit, move them to the next 75 token chunk"),
+    "sdxl_clip_l_skip": OptionInfo(False, "Clip skip SDXL", gr.Checkbox).info("Enable Clip skip for the secondary clip model in sdxl. Has no effect on SD 1.5 or SD 2.0/2.1."),
    "CLIP_stop_at_last_layers": OptionInfo(1, "Clip skip", gr.Slider, {"minimum": 1, "maximum": 12, "step": 1}, infotext="Clip skip").link("wiki", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#clip-skip").info("ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer"),
    "upcast_attn": OptionInfo(False, "Upcast cross attention layer to float32"),
    "randn_source": OptionInfo("GPU", "Random number generator source.", gr.Radio, {"choices": ["GPU", "CPU", "NV"]}, infotext="RNG").info("changes seeds drastically; use CPU to produce the same picture across different videocard vendors; use NV to produce same picture as on NVidia videocards"),
@ -209,7 +226,8 @@ options_templates.update(options_section(('img2img', "img2img", "sd"), {

 options_templates.update(options_section(('optimizations', "Optimizations", "sd"), {
    "cross_attention_optimization": OptionInfo("Automatic", "Cross attention optimization", gr.Dropdown, lambda: {"choices": shared_items.cross_attention_optimizations()}),
-    "s_min_uncond": OptionInfo(0.0, "Negative Guidance minimum sigma", gr.Slider, {"minimum": 0.0, "maximum": 15.0, "step": 0.01}).link("PR", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/9177").info("skip negative prompt for some steps when the image is almost ready; 0=disable, higher=faster"),
+    "s_min_uncond": OptionInfo(0.0, "Negative Guidance minimum sigma", gr.Slider, {"minimum": 0.0, "maximum": 15.0, "step": 0.01}, infotext='NGMS').link("PR", "https://github.com/AUTOMATIC1111/stablediffusion-webui/pull/9177").info("skip negative prompt for some steps when the image is almost ready; 0=disable, higher=faster"),
+    "s_min_uncond_all": OptionInfo(False, "Negative Guidance minimum sigma all steps", infotext='NGMS all steps').info("By default, NGMS above skips every other step; this makes it skip all steps"),
    "token_merging_ratio": OptionInfo(0.0, "Token merging ratio", gr.Slider, {"minimum": 0.0, "maximum": 0.9, "step": 0.1}, infotext='Token merging ratio').link("PR", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/9256").info("0=disable, higher=faster"),
    "token_merging_ratio_img2img": OptionInfo(0.0, "Token merging ratio for img2img", gr.Slider, {"minimum": 0.0, "maximum": 0.9, "step": 0.1}).info("only applies if non-zero and overrides above"),
    "token_merging_ratio_hr": OptionInfo(0.0, "Token merging ratio for high-res pass", gr.Slider, {"minimum": 0.0, "maximum": 0.9, "step": 0.1}, infotext='Token merging ratio hr').info("only applies if non-zero and overrides above"),
@ -369,6 +387,7 @@ options_templates.update(options_section(('ui', "Live previews", "ui"), {
    "live_preview_refresh_period": OptionInfo(1000, "Progressbar and preview update period").info("in milliseconds"),
    "live_preview_fast_interrupt": OptionInfo(False, "Return image with chosen live preview method on interrupt").info("makes interrupts faster"),
    "js_live_preview_in_modal_lightbox": OptionInfo(False, "Show Live preview in full page image viewer"),
+    "prevent_screen_sleep_during_generation": OptionInfo(True, "Prevent screen sleep during generation"),
 }))

 options_templates.update(options_section(('sampler-params', "Sampler parameters", "sd"), {
@ -390,7 +409,8 @@ options_templates.update(options_section(('sampler-params', "Sampler parameters"
    'uni_pc_skip_type': OptionInfo("time_uniform", "UniPC skip type", gr.Radio, {"choices": ["time_uniform", "time_quadratic", "logSNR"]}, infotext='UniPC skip type'),
    'uni_pc_order': OptionInfo(3, "UniPC order", gr.Slider, {"minimum": 1, "maximum": 50, "step": 1}, infotext='UniPC order').info("must be < sampling steps"),
    'uni_pc_lower_order_final': OptionInfo(True, "UniPC lower order final", infotext='UniPC lower order final'),
-    'sd_noise_schedule': OptionInfo("Default", "Noise schedule for sampling", gr.Radio, {"choices": ["Default", "Zero Terminal SNR"]}, infotext="Noise Schedule").info("for use with zero terminal SNR trained models")
+    'sd_noise_schedule': OptionInfo("Default", "Noise schedule for sampling", gr.Radio, {"choices": ["Default", "Zero Terminal SNR"]}, infotext="Noise Schedule").info("for use with zero terminal SNR trained models"),
+    'skip_early_cond': OptionInfo(0.0, "Ignore negative prompt during early sampling", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}, infotext="Skip Early CFG").info("disables CFG on a proportion of steps at the beginning of generation; 0=skip none; 1=skip all; can both improve sample diversity/quality and speed up sampling"),
 }))

 options_templates.update(options_section(('postprocessing', "Postprocessing", "postprocessing"), {
--- a/modules/textual_inversion/textual_inversion.py
+++ b/modules/textual_inversion/textual_inversion.py
@ -181,12 +181,16 @@ class EmbeddingDatabase:
        else:
            return

-        embedding = create_embedding_from_data(data, name, filename=filename, filepath=path)
+        if data is not None:
+            embedding = create_embedding_from_data(data, name, filename=filename, filepath=path)

-        if self.expected_shape == -1 or self.expected_shape == embedding.shape:
-            self.register_embedding(embedding, shared.sd_model)
+            if self.expected_shape == -1 or self.expected_shape == embedding.shape:
+                self.register_embedding(embedding, shared.sd_model)
+            else:
+                self.skipped_embeddings[name] = embedding
        else:
-            self.skipped_embeddings[name] = embedding
+            print(f"Unable to load Textual inversion embedding due to data issue: '{name}'.")
+

    def load_from_dir(self, embdir):
        if not os.path.isdir(embdir.path):
--- a/modules/torch_utils.py
+++ b/modules/torch_utils.py
@ -1,6 +1,7 @@
 from __future__ import annotations

 import torch.nn
+import torch


 def get_param(model) -> torch.nn.Parameter:
@ -15,3 +16,11 @@ def get_param(model) -> torch.nn.Parameter:
        return param

    raise ValueError(f"No parameters found in model {model!r}")
+
+
+def float64(t: torch.Tensor):
+    """return torch.float64 if device is not mps or xpu, else return torch.float32"""
+    match t.device.type:
+        case 'mps', 'xpu':
+            return torch.float32
+    return torch.float64
--- a/modules/ui.py
+++ b/modules/ui.py
@ -38,9 +38,11 @@ warnings.filterwarnings("default" if opts.show_gradio_deprecation_warnings else
 # this is a fix for Windows users. Without it, javascript files will be served with text/html content-type and the browser will not show any UI
 mimetypes.init()
 mimetypes.add_type('application/javascript', '.js')
+mimetypes.add_type('application/javascript', '.mjs')

 # Likewise, add explicit content-type header for certain missing image types
 mimetypes.add_type('image/webp', '.webp')
+mimetypes.add_type('image/avif', '.avif')

 if not cmd_opts.share and not cmd_opts.listen:
    # fix gradio phoning home
@ -566,18 +568,25 @@ def create_ui():
                                init_mask_inpaint = gr.Image(label="Mask", source="upload", interactive=True, type="pil", image_mode="RGBA", elem_id="img_inpaint_mask")

                            with gr.TabItem('Batch', id='batch', elem_id="img2img_batch_tab") as tab_batch:
-                                hidden = '<br>Disabled when launched with --hide-ui-dir-config.' if shared.cmd_opts.hide_ui_dir_config else ''
-                                gr.HTML(
-                                    "<p style='padding-bottom: 1em;' class=\"text-gray-500\">Process images in a directory on the same machine where the server is running." +
-                                    "<br>Use an empty output directory to save pictures normally instead of writing to the output directory." +
-                                    f"<br>Add inpaint batch mask directory to enable inpaint batch processing."
-                                    f"{hidden}</p>"
-                                )
-                                img2img_batch_input_dir = gr.Textbox(label="Input directory", **shared.hide_dirs, elem_id="img2img_batch_input_dir")
-                                img2img_batch_output_dir = gr.Textbox(label="Output directory", **shared.hide_dirs, elem_id="img2img_batch_output_dir")
-                                img2img_batch_inpaint_mask_dir = gr.Textbox(label="Inpaint batch mask directory (required for inpaint batch processing only)", **shared.hide_dirs, elem_id="img2img_batch_inpaint_mask_dir")
+                                with gr.Tabs(elem_id="img2img_batch_source"):
+                                    img2img_batch_source_type = gr.Textbox(visible=False, value="upload")
+                                    with gr.TabItem('Upload', id='batch_upload', elem_id="img2img_batch_upload_tab") as tab_batch_upload:
+                                        img2img_batch_upload = gr.Files(label="Files", interactive=True, elem_id="img2img_batch_upload")
+                                    with gr.TabItem('From directory', id='batch_from_dir', elem_id="img2img_batch_from_dir_tab") as tab_batch_from_dir:
+                                        hidden = '<br>Disabled when launched with --hide-ui-dir-config.' if shared.cmd_opts.hide_ui_dir_config else ''
+                                        gr.HTML(
+                                            "<p style='padding-bottom: 1em;' class=\"text-gray-500\">Process images in a directory on the same machine where the server is running." +
+                                            "<br>Use an empty output directory to save pictures normally instead of writing to the output directory." +
+                                            f"<br>Add inpaint batch mask directory to enable inpaint batch processing."
+                                            f"{hidden}</p>"
+                                        )
+                                        img2img_batch_input_dir = gr.Textbox(label="Input directory", **shared.hide_dirs, elem_id="img2img_batch_input_dir")
+                                        img2img_batch_output_dir = gr.Textbox(label="Output directory", **shared.hide_dirs, elem_id="img2img_batch_output_dir")
+                                        img2img_batch_inpaint_mask_dir = gr.Textbox(label="Inpaint batch mask directory (required for inpaint batch processing only)", **shared.hide_dirs, elem_id="img2img_batch_inpaint_mask_dir")
+                                tab_batch_upload.select(fn=lambda: "upload", inputs=[], outputs=[img2img_batch_source_type])
+                                tab_batch_from_dir.select(fn=lambda: "from dir", inputs=[], outputs=[img2img_batch_source_type])
                                with gr.Accordion("PNG info", open=False):
-                                    img2img_batch_use_png_info = gr.Checkbox(label="Append png info to prompts", **shared.hide_dirs, elem_id="img2img_batch_use_png_info")
+                                    img2img_batch_use_png_info = gr.Checkbox(label="Append png info to prompts", elem_id="img2img_batch_use_png_info")
                                    img2img_batch_png_info_dir = gr.Textbox(label="PNG info directory", **shared.hide_dirs, placeholder="Leave empty to use input directory", elem_id="img2img_batch_png_info_dir")
                                    img2img_batch_png_info_props = gr.CheckboxGroup(["Prompt", "Negative prompt", "Seed", "CFG scale", "Sampler", "Steps", "Model hash"], label="Parameters to take from png info", info="Prompts from png info will be appended to prompts set in ui.")

@ -759,6 +768,8 @@ def create_ui():
                    img2img_batch_use_png_info,
                    img2img_batch_png_info_props,
                    img2img_batch_png_info_dir,
+                    img2img_batch_source_type,
+                    img2img_batch_upload,
                ] + custom_inputs,
                outputs=[
                    output_panel.gallery,
--- a/modules/ui_extensions.py
+++ b/modules/ui_extensions.py
@ -396,15 +396,15 @@ def install_extension_from_url(dirname, url, branch_name=None):
        shutil.rmtree(tmpdir, True)


-def install_extension_from_index(url, hide_tags, sort_column, filter_text):
+def install_extension_from_index(url, selected_tags, showing_type, filtering_type, sort_column, filter_text):
    ext_table, message = install_extension_from_url(None, url)

-    code, _ = refresh_available_extensions_from_data(hide_tags, sort_column, filter_text)
+    code, _ = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text)

    return code, ext_table, message, ''


-def refresh_available_extensions(url, hide_tags, sort_column):
+def refresh_available_extensions(url, selected_tags, showing_type, filtering_type, sort_column):
    global available_extensions

    import urllib.request
@ -413,19 +413,19 @@ def refresh_available_extensions(url, hide_tags, sort_column):

    available_extensions = json.loads(text)

-    code, tags = refresh_available_extensions_from_data(hide_tags, sort_column)
+    code, tags = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column)

    return url, code, gr.CheckboxGroup.update(choices=tags), '', ''


-def refresh_available_extensions_for_tags(hide_tags, sort_column, filter_text):
-    code, _ = refresh_available_extensions_from_data(hide_tags, sort_column, filter_text)
+def refresh_available_extensions_for_tags(selected_tags, showing_type, filtering_type, sort_column, filter_text):
+    code, _ = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text)

    return code, ''


-def search_extensions(filter_text, hide_tags, sort_column):
-    code, _ = refresh_available_extensions_from_data(hide_tags, sort_column, filter_text)
+def search_extensions(filter_text, selected_tags, showing_type, filtering_type, sort_column):
+    code, _ = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text)

    return code, ''

@ -450,13 +450,13 @@ def get_date(info: dict, key):
        return ''


-def refresh_available_extensions_from_data(hide_tags, sort_column, filter_text=""):
+def refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text=""):
    extlist = available_extensions["extensions"]
    installed_extensions = {extension.name for extension in extensions.extensions}
    installed_extension_urls = {normalize_git_url(extension.remote) for extension in extensions.extensions if extension.remote is not None}

    tags = available_extensions.get("tags", {})
-    tags_to_hide = set(hide_tags)
+    selected_tags = set(selected_tags)
    hidden = 0

    code = f"""<!-- {time.time()} -->
@ -489,9 +489,19 @@ def refresh_available_extensions_from_data(hide_tags, sort_column, filter_text="
        existing = get_extension_dirname_from_url(url) in installed_extensions or normalize_git_url(url) in installed_extension_urls
        extension_tags = extension_tags + ["installed"] if existing else extension_tags

-        if any(x for x in extension_tags if x in tags_to_hide):
-            hidden += 1
-            continue
+        if len(selected_tags) > 0:
+            matched_tags = [x for x in extension_tags if x in selected_tags]
+            if filtering_type == 'or':
+                need_hide = len(matched_tags) > 0
+            else:
+                need_hide = len(matched_tags) == len(selected_tags)
+
+            if showing_type == 'show':
+                need_hide = not need_hide
+
+            if need_hide:
+                hidden += 1
+                continue

        if filter_text and filter_text.strip():
            if filter_text.lower() not in html.escape(name).lower() and filter_text.lower() not in html.escape(description).lower():
@ -594,8 +604,12 @@ def create_ui():
                    install_extension_button = gr.Button(elem_id="install_extension_button", visible=False)

                with gr.Row():
-                    hide_tags = gr.CheckboxGroup(value=["ads", "localization", "installed"], label="Hide extensions with tags", choices=["script", "ads", "localization", "installed"])
-                    sort_column = gr.Radio(value="newest first", label="Order", choices=["newest first", "oldest first", "a-z", "z-a", "internal order",'update time', 'create time', "stars"], type="index")
+                    selected_tags = gr.CheckboxGroup(value=["ads", "localization", "installed"], label="Extension tags", choices=["script", "ads", "localization", "installed"], elem_classes=['compact-checkbox-group'])
+                    sort_column = gr.Radio(value="newest first", label="Order", choices=["newest first", "oldest first", "a-z", "z-a", "internal order",'update time', 'create time', "stars"], type="index", elem_classes=['compact-checkbox-group'])
+
+                with gr.Row():
+                    showing_type = gr.Radio(value="hide", label="Showing type", choices=["hide", "show"], elem_classes=['compact-checkbox-group'])
+                    filtering_type = gr.Radio(value="or", label="Filtering type", choices=["or", "and"], elem_classes=['compact-checkbox-group'])

                with gr.Row():
                    search_extensions_text = gr.Text(label="Search", container=False)
@ -605,31 +619,43 @@ def create_ui():

                refresh_available_extensions_button.click(
                    fn=modules.ui.wrap_gradio_call(refresh_available_extensions, extra_outputs=[gr.update(), gr.update(), gr.update(), gr.update()]),
-                    inputs=[available_extensions_index, hide_tags, sort_column],
-                    outputs=[available_extensions_index, available_extensions_table, hide_tags, search_extensions_text, install_result],
+                    inputs=[available_extensions_index, selected_tags, showing_type, filtering_type, sort_column],
+                    outputs=[available_extensions_index, available_extensions_table, selected_tags, search_extensions_text, install_result],
                )

                install_extension_button.click(
                    fn=modules.ui.wrap_gradio_call(install_extension_from_index, extra_outputs=[gr.update(), gr.update()]),
-                    inputs=[extension_to_install, hide_tags, sort_column, search_extensions_text],
+                    inputs=[extension_to_install, selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
                    outputs=[available_extensions_table, extensions_table, install_result],
                )

                search_extensions_text.change(
                    fn=modules.ui.wrap_gradio_call(search_extensions, extra_outputs=[gr.update()]),
-                    inputs=[search_extensions_text, hide_tags, sort_column],
+                    inputs=[search_extensions_text, selected_tags, showing_type, filtering_type, sort_column],
                    outputs=[available_extensions_table, install_result],
                )

-                hide_tags.change(
+                selected_tags.change(
                    fn=modules.ui.wrap_gradio_call(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
-                    inputs=[hide_tags, sort_column, search_extensions_text],
+                    inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
+                    outputs=[available_extensions_table, install_result]
+                )
+
+                showing_type.change(
+                    fn=modules.ui.wrap_gradio_call(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
+                    inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
+                    outputs=[available_extensions_table, install_result]
+                )
+
+                filtering_type.change(
+                    fn=modules.ui.wrap_gradio_call(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
+                    inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
                    outputs=[available_extensions_table, install_result]
                )

                sort_column.change(
                    fn=modules.ui.wrap_gradio_call(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
-                    inputs=[hide_tags, sort_column, search_extensions_text],
+                    inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
                    outputs=[available_extensions_table, install_result]
                )

--- a/modules/ui_gradio_extensions.py
+++ b/modules/ui_gradio_extensions.py
@ -50,7 +50,7 @@ def reload_javascript():

    def template_response(*args, **kwargs):
        res = shared.GradioTemplateResponseOriginal(*args, **kwargs)
-        res.body = res.body.replace(b'</head>', f'{js}</head>'.encode("utf8"))
+        res.body = res.body.replace(b'</head>', f'{js}<meta name="referrer" content="no-referrer"/></head>'.encode("utf8"))
        res.body = res.body.replace(b'</body>', f'{css}</body>'.encode("utf8"))
        res.init_headers()
        return res
--- a/modules/util.py
+++ b/modules/util.py
@ -208,6 +208,6 @@ Requested path was: {path}
    elif platform.system() == "Darwin":
        subprocess.Popen(["open", path])
    elif "microsoft-standard-WSL2" in platform.uname().release:
-        subprocess.Popen(["wsl-open", path])
+        subprocess.Popen(["explorer.exe", subprocess.check_output(["wslpath", "-w", path])])
    else:
        subprocess.Popen(["xdg-open", path])