cloneofsimo
diff --git a/‎README.md‎
Lines changed: 5 additions & 0 deletions b/‎README.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎contents/inpainting_base_image.png‎
619 KB b/‎contents/inpainting_base_image.png‎
619 KB
diff --git a/‎contents/inpainting_mask.png‎
3.49 KB b/‎contents/inpainting_mask.png‎
3.49 KB
diff --git a/‎contents/lora_pti_inpainting.jpg‎
31.6 KB b/‎contents/lora_pti_inpainting.jpg‎
31.6 KB
diff --git a/‎contents/lora_pti_inpainting_example.jpg‎
371 KB b/‎contents/lora_pti_inpainting_example.jpg‎
371 KB
diff --git a/‎example_loras/and.safetensors‎
11.8 MB b/‎example_loras/and.safetensors‎
11.8 MB
diff --git a/‎example_loras/lora_krk_inpainting.safetensors‎
5.92 MB b/‎example_loras/lora_krk_inpainting.safetensors‎
5.92 MB
diff --git a/‎lora_diffusion/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎lora_diffusion/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lora_diffusion/cli_lora_add.py‎
Lines changed: 2 additions & 48 deletions b/‎lora_diffusion/cli_lora_add.py‎
Lines changed: 2 additions & 48 deletions
diff --git a/‎lora_diffusion/cli_lora_pti.py‎
Lines changed: 81 additions & 6 deletions b/‎lora_diffusion/cli_lora_pti.py‎
Lines changed: 81 additions & 6 deletions
@@ -37,6 +37,7 @@
 - Fine-tune Stable Diffusion models twice as fast as the Dreambooth method, using Low-Rank Adaptation
 - Get insanely small end result (1MB ~ 6MB), easy to share and download.
 - Compatible with `diffusers`
+- Support for inpainting
 - Sometimes _even better performance_ than full fine-tuning (but left as future work for extensive comparisons)
 - Merge checkpoints + Build recipes by merging LoRAs together
 - Pipeline to fine-tune CLIP + Unet + token to gain better results.
@@ -50,6 +51,10 @@
 
 # UPDATES & Notes
 
+### 2023/02/06
+
+- Support for training inpainting on LoRA PTI. Use the `--train-inpainting` flag with an inpainting Stable Diffusion base model (see `inpainting_example.sh`).
+
 ### 2023/02/01
 
 - LoRA Joining is now available with the `--mode=ljl` flag. Only three parameters are required: `path_to_lora1`, `path_to_lora2`, and `path_to_save`.
 
@@ -2,3 +2,4 @@
 from .dataset import *
 from .utils import *
 from .preprocess_files import *
+from .lora_manager import *
@@ -12,6 +12,7 @@
     collapse_lora,
     monkeypatch_remove_lora,
 )
+from .lora_manager import lora_join
 from .to_ckpt_v2 import convert_to_ckpt
 
 
@@ -20,53 +21,6 @@ def _text_lora_path(path: str) -> str:
     return ".".join(path.split(".")[:-1] + ["text_encoder", "pt"])
 
 
-def lora_join(lora_safetenors: list):
-    metadatas = [dict(safelora.metadata()) for safelora in lora_safetenors]
-    total_metadata = {}
-    total_tensor = {}
-    total_rank = 0
-    for _metadata in metadatas:
-        rankset = []
-        for k, v in _metadata.items():
-            if k.endswith("rank"):
-                rankset.append(int(v))
-
-        assert len(set(rankset)) == 1, "Rank should be the same per model"
-        total_rank += rankset[0]
-        total_metadata.update(_metadata)
-
-    tensorkeys = set()
-    for safelora in lora_safetenors:
-        tensorkeys.update(safelora.keys())
-
-    for keys in tensorkeys:
-        if keys.startswith("text_encoder") or keys.startswith("unet"):
-            tensorset = [safelora.get_tensor(keys) for safelora in lora_safetenors]
-
-            is_down = keys.endswith("down")
-
-            if is_down:
-                _tensor = torch.cat(tensorset, dim=0)
-                assert _tensor.shape[0] == total_rank
-            else:
-                _tensor = torch.cat(tensorset, dim=1)
-                assert _tensor.shape[1] == total_rank
-
-            total_tensor[keys] = _tensor
-            keys_rank = ":".join(keys.split(":")[:-1]) + ":rank"
-            total_metadata[keys_rank] = str(total_rank)
-
-    for idx, safelora in enumerate(lora_safetenors):
-        tokens = [k for k, v in safelora.metadata().items() if v == "<embed>"]
-        for jdx, token in enumerate(sorted(tokens)):
-            del total_metadata[token]
-            total_tensor[f"<s{idx}-{jdx}>"] = safelora.get_tensor(token)
-            total_metadata[f"<s{idx}-{jdx}>"] = "<embed>"
-            print(f"Embedding {token} replaced to <s{idx}-{jdx}>")
-
-    return total_tensor, total_metadata
-
-
 def add(
     path_1: str,
     path_2: str,
@@ -221,7 +175,7 @@ def add(
         safeloras_1 = safe_open(path_1, framework="pt", device="cpu")
         safeloras_2 = safe_open(path_2, framework="pt", device="cpu")
 
-        total_tensor, total_metadata = lora_join([safeloras_1, safeloras_2])
+        total_tensor, total_metadata, _, _ = lora_join([safeloras_1, safeloras_2])
         save_file(total_tensor, output_path, total_metadata)
 
     else:
 
@@ -168,13 +168,60 @@ def collate_fn(examples):
 
     return train_dataloader
 
+def inpainting_dataloader(train_dataset, train_batch_size, tokenizer, vae, text_encoder):
+    """Build a shuffling DataLoader whose batches carry inpainting inputs.
+
+    Each batch is a dict with the keys "input_ids", "pixel_values",
+    "mask_values" and "masked_image_values". A "mask" entry is added when
+    the first example of the batch provides one.
+
+    Args:
+        train_dataset: Dataset whose items are dicts containing
+            "instance_prompt_ids", "instance_images", "instance_masks" and
+            "instance_masked_images". Items may also carry the matching
+            "class_*" keys and an optional "mask" entry.
+        train_batch_size: Number of dataset items per batch.
+        tokenizer: Object providing `pad(...)` and `model_max_length`.
+        vae: Accepted but not used inside this function.
+        text_encoder: Accepted but not used inside this function.
+
+    Returns:
+        A `torch.utils.data.DataLoader` over `train_dataset` with
+        `shuffle=True` and the collate function defined below.
+    """
+    def collate_fn(examples):
+        # Gather the per-example instance fields into parallel lists.
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+        mask_values = [example["instance_masks"] for example in examples]
+        masked_image_values = [example["instance_masked_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        # Only the first example is inspected to decide whether class data
+        # exists; class entries are appended after all instance entries.
+        if examples[0].get("class_prompt_ids", None) is not None:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+            mask_values += [example["class_masks"] for example in examples]
+            masked_image_values += [example["class_masked_images"] for example in examples]
+
+        # Stack each list into one contiguous float tensor.
+        pixel_values = torch.stack(pixel_values).to(memory_format=torch.contiguous_format).float()
+        mask_values = torch.stack(mask_values).to(memory_format=torch.contiguous_format).float()
+        masked_image_values = torch.stack(masked_image_values).to(memory_format=torch.contiguous_format).float()
+
+        # Pad every prompt to the tokenizer's maximum length and get tensors back.
+        input_ids = tokenizer.pad(
+            {"input_ids": input_ids},
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+            "mask_values": mask_values,
+            "masked_image_values": masked_image_values
+        }
+
+        # Optional per-example "mask" entry, separate from "mask_values";
+        # presence is decided from the first example only.
+        if examples[0].get("mask", None) is not None:
+            batch["mask"] = torch.stack([example["mask"] for example in examples])
+
+        return batch
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+    )
+
+    return train_dataloader
 
 def loss_step(
     batch,
     unet,
     vae,
     text_encoder,
     scheduler,
+    train_inpainting=False,
     t_mutliplier=1.0,
     mixed_precision=False,
     mask_temperature=1.0,
@@ -186,6 +233,16 @@ def loss_step(
     ).latent_dist.sample()
     latents = latents * 0.18215
 
+    if train_inpainting:
+        masked_image_latents = vae.encode(
+            batch["masked_image_values"].to(dtype=weight_dtype).to(unet.device)
+        ).latent_dist.sample()
+        masked_image_latents = masked_image_latents * 0.18215
+        mask = F.interpolate(
+            batch["mask_values"].to(dtype=weight_dtype).to(unet.device),
+            scale_factor=1/8
+        )
+
     noise = torch.randn_like(latents)
     bsz = latents.shape[0]
 
@@ -199,21 +256,26 @@ def loss_step(
 
     noisy_latents = scheduler.add_noise(latents, noise, timesteps)
 
+    if train_inpainting:
+        latent_model_input = torch.cat([noisy_latents, mask, masked_image_latents], dim=1)
+    else:
+        latent_model_input = noisy_latents
+
     if mixed_precision:
         with torch.cuda.amp.autocast():
 
             encoder_hidden_states = text_encoder(
                 batch["input_ids"].to(text_encoder.device)
             )[0]
 
-            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+            model_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample
     else:
 
         encoder_hidden_states = text_encoder(
             batch["input_ids"].to(text_encoder.device)
         )[0]
 
-        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+        model_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample
 
     if scheduler.config.prediction_type == "epsilon":
         target = noise
@@ -270,6 +332,7 @@ def train_inversion(
     log_wandb: bool = False,
     wandb_log_prompt_cnt: int = 10,
     class_token: str = "person",
+    train_inpainting: bool = False,
     mixed_precision: bool = False,
     clip_ti_decay: bool = True,
 ):
@@ -302,6 +365,7 @@ def train_inversion(
                         vae,
                         text_encoder,
                         scheduler,
+                        train_inpainting=train_inpainting,
                         mixed_precision=mixed_precision,
                     )
                     / accum_iter
@@ -384,7 +448,7 @@ def train_inversion(
                         # open all images in test_image_path
                         images = []
                         for file in os.listdir(test_image_path):
-                            if file.endswith(".png") or file.endswith(".jpg") or file.endswith(".jpeg"):
+                            if file.lower().endswith(".png") or file.lower().endswith(".jpg") or file.lower().endswith(".jpeg"):
                                 images.append(
                                     Image.open(os.path.join(test_image_path, file))
                                 )
@@ -429,6 +493,7 @@ def perform_tuning(
     log_wandb: bool = False,
     wandb_log_prompt_cnt: int = 10,
     class_token: str = "person",
+    train_inpainting: bool = False,
 ):
 
     progress_bar = tqdm(range(num_steps))
@@ -457,6 +522,7 @@ def perform_tuning(
                 vae,
                 text_encoder,
                 scheduler,
+                train_inpainting=train_inpainting,
                 t_mutliplier=0.8,
                 mixed_precision=True,
                 mask_temperature=mask_temperature,
@@ -565,6 +631,7 @@ def train(
     stochastic_attribute: Optional[str] = None,
     perform_inversion: bool = True,
     use_template: Literal[None, "object", "style"] = None,
+    train_inpainting: bool = False,
     placeholder_tokens: str = "",
     placeholder_token_at_data: Optional[str] = None,
     initializer_tokens: Optional[str] = None,
@@ -716,13 +783,19 @@ def train(
         color_jitter=color_jitter,
         use_face_segmentation_condition=use_face_segmentation_condition,
         use_mask_captioned_data=use_mask_captioned_data,
+        train_inpainting=train_inpainting,
     )
 
     train_dataset.blur_amount = 200
 
-    train_dataloader = text2img_dataloader(
-        train_dataset, train_batch_size, tokenizer, vae, text_encoder
-    )
+    if train_inpainting:
+        train_dataloader = inpainting_dataloader(
+            train_dataset, train_batch_size, tokenizer, vae, text_encoder
+        )
+    else:
+        train_dataloader = text2img_dataloader(
+            train_dataset, train_batch_size, tokenizer, vae, text_encoder
+        )
 
     index_no_updates = torch.arange(len(tokenizer)) != -1
 
@@ -776,6 +849,7 @@ def train(
             log_wandb=log_wandb,
             wandb_log_prompt_cnt=wandb_log_prompt_cnt,
             class_token=class_token,
+            train_inpainting=train_inpainting,
             mixed_precision=False,
             tokenizer=tokenizer,
             clip_ti_decay=clip_ti_decay,
@@ -883,6 +957,7 @@ def train(
         log_wandb=log_wandb,
         wandb_log_prompt_cnt=wandb_log_prompt_cnt,
         class_token=class_token,
+        train_inpainting=train_inpainting,
     )