lxzcpro committed
Commit 9de67ae · 1 Parent(s): 03bafc0

code clean up

Files changed (5):
  1. app.py +6 -9
  2. src/matcher.py +7 -11
  3. src/painter.py +17 -17
  4. src/pipeline.py +11 -24
  5. src/segmenter.py +5 -5
app.py CHANGED

@@ -3,7 +3,7 @@ import numpy as np
 from src.pipeline import ObjectRemovalPipeline
 from src.utils import visualize_mask
 
-# Initialize pipeline once
+
 pipeline = ObjectRemovalPipeline()
 
 def ensure_uint8(image):
@@ -18,7 +18,7 @@ def step1_detect(image, text_query):
     if image is None or not text_query:
         return [], [], "Please upload image and enter text."
 
-    # Calls the new method in pipeline.py
+
     candidates, msg = pipeline.get_candidates(image, text_query)
 
     if not candidates:
@@ -26,11 +26,11 @@ def step1_detect(image, text_query):
 
     masks = [c['mask'] for c in candidates]
 
-    # Generate visualization for gallery
+
     gallery_imgs = []
     for i, mask in enumerate(masks):
         viz = visualize_mask(image, mask)
-        # Label with rank and score if available
+
         label = f"Option {i+1} (Score: {candidates[i].get('weighted_score', 0):.2f})"
         gallery_imgs.append((ensure_uint8(viz), label))
 
@@ -45,15 +45,13 @@ def step2_remove(image, masks, selected_idx, prompt, shadow_exp):
 
     target_mask = masks[selected_idx]
 
-    # Calls the pipeline method
+
     result = pipeline.inpaint_selected(image, target_mask, prompt, shadow_expansion=shadow_exp)
 
     return ensure_uint8(result), "Success!"
 
-# CSS for cleaner UI
 css = """
 .gradio-container {min-height: 0px !important}
-/* Ensure images in gallery don't get cropped strictly */
 button.gallery-item {object-fit: contain !important}
 """
 
@@ -70,8 +68,7 @@ with gr.Blocks(title="TextEraser", css=css, theme=gr.themes.Soft()) as demo:
     btn_detect = gr.Button("1. Detect Objects", variant="primary")
 
     with gr.Column(scale=1):
-        # FIXED: object_fit="contain" prevents cropping
-        # allow_preview=True lets you click to zoom
+
         gallery = gr.Gallery(
             label="Candidates (Select One)",
             columns=2,
 
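Note: step1_detect returns (gallery_images, masks, message) and step2_remove returns (result, message), so the two handlers slot into a two-step Blocks flow. A minimal wiring sketch, with state holders and component names that are illustrative rather than part of this commit:

import gradio as gr

with gr.Blocks() as demo:
    masks_state = gr.State([])    # masks returned by step1_detect
    selected_idx = gr.State(0)    # gallery index picked by the user

    image_in = gr.Image(label="Input")
    query_in = gr.Textbox(label="Object to remove")
    gallery = gr.Gallery(label="Candidates (Select One)", columns=2)
    status = gr.Textbox(label="Status")
    result_out = gr.Image(label="Result")
    prompt_in = gr.Textbox(label="Inpaint prompt (optional)")
    shadow = gr.Slider(0, 50, value=0, label="Shadow expansion")

    # Step 1: detect candidates and fill the gallery
    gr.Button("1. Detect Objects").click(
        step1_detect,
        inputs=[image_in, query_in],
        outputs=[gallery, masks_state, status],
    )

    # Remember which candidate was clicked in the gallery
    def on_select(evt: gr.SelectData):
        return evt.index

    gallery.select(on_select, outputs=[selected_idx])

    # Step 2: inpaint the selected candidate
    gr.Button("2. Remove").click(
        step2_remove,
        inputs=[image_in, masks_state, selected_idx, prompt_in, shadow],
        outputs=[result_out, status],
    )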
src/matcher.py CHANGED

@@ -7,19 +7,19 @@ from transformers import CLIPProcessor, CLIPModel
 class CLIPMatcher:
     def __init__(self, model_name='openai/clip-vit-large-patch14'):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Load directly to CPU first
+
         self.model = CLIPModel.from_pretrained(model_name).to("cpu")
         self.processor = CLIPProcessor.from_pretrained(model_name)
 
     def get_top_k_segments(self, image, segments, text_query, k=5):
         if not segments: return []
 
-        # 1. Clean Text
+
         ignore = ['remove', 'delete', 'erase', 'the', 'a', 'an']
         words = [w for w in text_query.lower().split() if w not in ignore]
         clean_text = " ".join(words) if words else text_query
 
-        # 2. Crop (CPU)
+
         pil_image = Image.fromarray(image)
         crops = []
         valid_segments = []
@@ -30,11 +30,11 @@ class CLIPMatcher:
         for seg in segments:
             if 'bbox' not in seg: continue
 
-            # Safe numpy cast
+
             bbox = np.array(seg['bbox']).astype(int)
             x1, y1, x2, y2 = bbox
 
-            # Adaptive Context Padding (30%)
+
             w_box, h_box = x2 - x1, y2 - y1
             pad_x = int(w_box * 0.3)
             pad_y = int(h_box * 0.3)
@@ -49,7 +49,7 @@ class CLIPMatcher:
 
         if not crops: return []
 
-        # 3. Inference (Brief GPU usage)
+
         try:
             self.model.to(self.device)
             inputs = self.processor(
@@ -58,17 +58,14 @@ class CLIPMatcher:
 
             with torch.no_grad():
                 outputs = self.model(**inputs)
-            # FIX: Use raw logits for meaningful scores.
-            # (Softmax forces sum=1, concealing bad matches)
             probs = outputs.logits_per_image.cpu().numpy().flatten()
         except Exception as e:
             print(f"CLIP Error: {e}")
             return []
         finally:
-            # Move back to CPU immediately
             self.model.to("cpu")
 
-        # 4. Score & Sort
+
         final_results = []
         for i, score in enumerate(probs):
             seg = valid_segments[i]
@@ -78,7 +75,6 @@ class CLIPMatcher:
             w, h = seg['bbox'][2]-seg['bbox'][0], seg['bbox'][3]-seg['bbox'][1]
             area_ratio = (w*h) / total_img_area
 
-            # Logits are roughly 15-30 range. Add small boost for area.
             weighted_score = float(score) + (area_ratio * 2.0)
 
             final_results.append({
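Note: the 30% padding above intentionally crops more than the raw bbox so CLIP sees the object with some surrounding context. A standalone sketch of that cropping step; the clamping to image bounds is not visible in these hunks, so it is an assumption:

import numpy as np
from PIL import Image

def crop_with_context(image: np.ndarray, bbox, pad_frac: float = 0.3) -> Image.Image:
    # Pad the box by 30% of its width/height so the crop keeps context,
    # clamping to the image bounds (assumed behavior, not shown in the diff).
    h, w = image.shape[:2]
    x1, y1, x2, y2 = np.array(bbox).astype(int)
    pad_x = int((x2 - x1) * pad_frac)
    pad_y = int((y2 - y1) * pad_frac)
    x1, y1 = max(0, x1 - pad_x), max(0, y1 - pad_y)
    x2, y2 = min(w, x2 + pad_x), min(h, y2 + pad_y)
    return Image.fromarray(image).crop((x1, y1, x2, y2))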
 
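Note: the removed comments documented the key scoring decision here. With a single text query, a softmax over logits_per_image always sums to 1 across crops, so even uniformly bad candidates look plausible; the raw logits (roughly 15-30 in practice, per the removed comment) keep absolute match strength, and a small area term favors larger objects. A toy illustration with made-up numbers:

import torch

logits = torch.tensor([[17.0], [18.5], [16.2]])   # one query, three crops
probs = logits.squeeze(1).softmax(dim=0)          # always sums to 1, hides bad matches
raw = logits.squeeze(1)                           # keeps absolute match strength

area_ratio = torch.tensor([0.05, 0.20, 0.01])     # bbox area / image area
weighted = raw + 2.0 * area_ratio                 # the area boost used above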
src/painter.py CHANGED

@@ -6,7 +6,7 @@ from diffusers import StableDiffusionInpaintPipeline, StableDiffusionXLInpaintPipeline
 class SDInpainter:
     def __init__(self, model_id="runwayml/stable-diffusion-inpainting"):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Use float16 for GPU to save VRAM and speed up inference
+
         self.pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
@@ -18,36 +18,36 @@ class SDInpainter:
     def inpaint(self, image, mask, prompt="background"):
         pil_image = Image.fromarray(image).convert('RGB')
 
-        # Dilate mask to ensure the object edge is covered
+
         mask = self._dilate_mask(mask)
         pil_mask = Image.fromarray((mask * 255).astype(np.uint8)).convert('L')
 
-        # 1. Keep aspect ratio, resize ensuring dimensions are multiples of 8
+
         w, h = pil_image.size
-        factor = 512 / max(w, h) # Scale based on the longest side
+        factor = 512 / max(w, h)
         new_w = int(w * factor) - (int(w * factor) % 8)
         new_h = int(h * factor) - (int(h * factor) % 8)
 
         resized_image = pil_image.resize((new_w, new_h), Image.LANCZOS)
         resized_mask = pil_mask.resize((new_w, new_h), Image.NEAREST)
 
-        # 2. Inpaint
+
         output = self.pipe(
             prompt=prompt,
-            negative_prompt="artifacts, low quality, distortion, object", # Add negative prompt for better quality
+            negative_prompt="artifacts, low quality, distortion, object",
             image=resized_image,
             mask_image=resized_mask,
             num_inference_steps=30,
             guidance_scale=7.5,
         ).images[0]
 
-        # 3. Resize back to original resolution
+
         result = output.resize((w, h), Image.LANCZOS)
 
         return np.array(result)
 
     def _dilate_mask(self, mask, kernel_size=9):
-        # Increased kernel size slightly for better blending
+
         import cv2
         kernel = np.ones((kernel_size, kernel_size), np.uint8)
         return cv2.dilate(mask, kernel, iterations=1)
@@ -56,24 +56,24 @@ class SDInpainter:
 class SDXLInpainter:
     def __init__(self, model_id="diffusers/stable-diffusion-xl-1.0-inpainting-0.1"):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Use float16
+
         self.pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
             model_id,
             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-            variant="fp16", # Add variant for faster loading if available
+            variant="fp16",
             use_safetensors=True
         ).to(self.device)
 
         if self.device == "cuda":
-            self.pipe.enable_model_cpu_offload() # Saves VRAM effectively
+            self.pipe.enable_model_cpu_offload()
 
-    def inpaint(self, image, mask, prompt=""): # Default prompt changed to empty
+    def inpaint(self, image, mask, prompt=""):
         pil_image = Image.fromarray(image).convert('RGB')
 
-        # Increase kernel size to 15 or 20 to ensure no edge artifacts remain
+
         mask = self._dilate_mask(mask, kernel_size=15)
 
-        # Blur the mask slightly to make the transition smoother
+
         import cv2
         mask = cv2.GaussianBlur(mask, (21, 21), 0)
 
@@ -90,7 +90,7 @@ class SDXLInpainter:
 
         if not prompt or prompt == "background":
             final_prompt = "clean background, empty space, seamless texture, high quality"
-            # Lower guidance scale for background filling to rely more on image context
+
             guidance_scale = 4.5
         else:
             final_prompt = prompt
@@ -108,8 +108,8 @@ class SDXLInpainter:
             image=resized_image,
             mask_image=resized_mask,
             num_inference_steps=40,
-            guidance_scale=guidance_scale, # Dynamic guidance
-            strength=0.99, # High strength to ensure removal
+            guidance_scale=guidance_scale,
+            strength=0.99,
         ).images[0]
 
         result = output.resize((w, h), Image.LANCZOS)
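Note: both inpainters rescale so the longest side is about 512 px and both dimensions are multiples of 8, matching the 8x downsampling of the Stable Diffusion VAE. The same arithmetic as the hunk above, pulled out as a sketch (the helper name is illustrative):

from PIL import Image

def fit_to_sd(pil_image: Image.Image, target: int = 512):
    # Scale the longest side to ~target, then round both dimensions
    # down to multiples of 8 (required by the VAE's 8x downsampling).
    w, h = pil_image.size
    factor = target / max(w, h)
    new_w = int(w * factor) - (int(w * factor) % 8)
    new_h = int(h * factor) - (int(h * factor) % 8)
    return pil_image.resize((new_w, new_h), Image.LANCZOS), (w, h)

The output is later resized back to the original (w, h) with LANCZOS, as the diff shows.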
 
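Note: SDXLInpainter pairs a larger dilation (15 px kernel) with a 21x21 Gaussian blur so the mask feathers out instead of ending in a hard seam at the object boundary. A minimal sketch of that mask preparation, assuming a uint8 mask like the ones SAM2 produces above:

import cv2
import numpy as np

def soften_mask(mask: np.ndarray, dilate_px: int = 15, blur_px: int = 21) -> np.ndarray:
    # Dilate so the object's edge pixels fall inside the inpainted region,
    # then blur so the mask feathers out (blur_px must be odd for cv2).
    kernel = np.ones((dilate_px, dilate_px), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)
    return cv2.GaussianBlur(mask, (blur_px, blur_px), 0)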
src/pipeline.py CHANGED

@@ -2,15 +2,13 @@ import numpy as np
 import cv2
 import torch
 import gc
-# Note: We import classes but DO NOT instantiate them globally
+
 from .segmenter import YOLOWorldDetector, SAM2Predictor
 from .matcher import CLIPMatcher
 from .painter import SDXLInpainter
 
 class ObjectRemovalPipeline:
     def __init__(self):
-        print("Initializing Pipeline in LOW MEMORY mode...")
-        # No models loaded at startup!
         pass
 
     def _clear_ram(self):
@@ -19,32 +17,26 @@ class ObjectRemovalPipeline:
             torch.cuda.empty_cache()
 
     def get_candidates(self, image, text_query):
-        """
-        Step 1: Detect & Segment & Rank
-        Strategy: Load one model at a time, use it, then delete it.
-        """
+
         candidates = []
         box_candidates = []
 
-        # --- PHASE 1: YOLO (Detect) ---
-        print("Loading YOLO...")
+
        detector = YOLOWorldDetector()
        try:
            box_candidates = detector.detect(image, text_query)
        finally:
-            del detector # Delete model immediately
+            del detector
             self._clear_ram()
 
         if not box_candidates:
             return [], "No objects detected."
 
-        # --- PHASE 2: SAM2 (Segment) ---
-        print("Loading SAM2...")
+
         segmenter = SAM2Predictor()
         segments_to_score = []
         try:
             segmenter.set_image(image)
-            # Process top 3 boxes -> up to 9 masks
             for cand in box_candidates[:3]:
                 bbox = cand['bbox']
                 mask_variations = segmenter.predict_from_box(bbox)
@@ -56,14 +48,12 @@ class ObjectRemovalPipeline:
                     'label': f"{cand['label']} (Var {i+1})"
                 })
         finally:
-            # Critical cleanup for SAM2
-            if hasattr(segmenter, 'clear_memory'):
-                segmenter.clear_memory()
+
+            segmenter.clear_memory()
             del segmenter
             self._clear_ram()
 
-        # --- PHASE 3: CLIP (Rank) ---
-        print("Loading CLIP...")
+
         matcher = CLIPMatcher()
         ranked_candidates = []
         try:
@@ -80,10 +70,8 @@ class ObjectRemovalPipeline:
         return ranked_candidates, f"Found {len(ranked_candidates)} options."
 
     def inpaint_selected(self, image, selected_mask, inpaint_prompt="", shadow_expansion=0):
-        """
-        Step 2: Inpaint
-        """
-        # Shadow / Edge Logic (CPU ops)
+
+
         if shadow_expansion > 0:
             kernel_h = int(shadow_expansion * 1.5)
             kernel_w = int(shadow_expansion * 0.5)
@@ -95,8 +83,7 @@ class ObjectRemovalPipeline:
 
         result = None
 
-        # --- PHASE 4: SDXL (Inpaint) ---
-        print("Loading SDXL...")
+
         inpainter = SDXLInpainter()
         try:
             result = inpainter.inpaint(image, final_mask, prompt=inpaint_prompt)
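Note: every phase in this file follows the same low-memory discipline: construct the model, use it inside try/finally, drop the reference, then reclaim memory the way _clear_ram does (gc.collect plus torch.cuda.empty_cache). A generic sketch of the pattern; run_phase is illustrative, not a helper in this repo:

import gc
import torch

def run_phase(make_model, fn, *args):
    # Load one model, use it, and free it before the next model loads.
    model = make_model()
    try:
        return fn(model, *args)
    finally:
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()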
 
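Note on the shadow handling in inpaint_selected: the kernel is about three times taller than it is wide (1.5 vs 0.5 times shadow_expansion), so the mask grows mostly vertically, where a cast shadow usually sits. The dilation call itself is outside the visible hunks, so this reconstruction is an assumption:

import cv2
import numpy as np

def expand_for_shadow(mask: np.ndarray, shadow_expansion: int) -> np.ndarray:
    # Tall, narrow kernel: mostly vertical growth to cover ground shadows.
    # The actual dilation call is not shown in this diff; assumed here.
    kernel_h = max(1, int(shadow_expansion * 1.5))
    kernel_w = max(1, int(shadow_expansion * 0.5))
    kernel = np.ones((kernel_h, kernel_w), np.uint8)
    return cv2.dilate(mask, kernel, iterations=1)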
src/segmenter.py CHANGED

@@ -6,7 +6,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
 
 class YOLOWorldDetector:
     def __init__(self, model_name='yolov8s-worldv2.pt'):
-        # Initialize, but manage device carefully
+
         self.model = YOLO(model_name)
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
@@ -16,7 +16,7 @@ class YOLOWorldDetector:
 
         boxes = []
         try:
-            # FIX: Force CPU for text encoding to prevent RuntimeError
+
             self.model.to('cpu')
             self.model.set_classes([clean_text])
 
@@ -37,7 +37,7 @@ class YOLOWorldDetector:
         except Exception as e:
             print(f"YOLO Error: {e}")
         finally:
-            # Always offload after use
+
             self.model.to('cpu')
 
         boxes.sort(key=lambda x: x['score'], reverse=True)
@@ -57,7 +57,7 @@ class SAM2Predictor:
 
     def predict_from_box(self, bbox):
         box_input = np.array(bbox)[None, :]
-        # Multimask = True for variety
+
         masks, scores, logits = self.predictor.predict(
             point_coords=None,
             point_labels=None,
@@ -68,7 +68,7 @@ class SAM2Predictor:
         return [(m.astype(np.uint8), s) for m, s in sorted_results]
 
     def clear_memory(self):
-        # Critical for preventing memory leaks
+
         self.predictor.reset_predictor()
         self.predictor.model.to('cpu')
         del self.predictor
 
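Note: predict_from_box relies on SAM2's box prompting, where multimask_output=True yields three mask hypotheses per box; pipeline.py then takes up to 3 boxes, hence up to 9 candidate masks. A minimal sketch, assuming predictor is a SAM2ImagePredictor with set_image already called:

import numpy as np

box_input = np.array([50, 60, 200, 220])[None, :]   # x1, y1, x2, y2
masks, scores, logits = predictor.predict(
    point_coords=None,
    point_labels=None,
    box=box_input,
    multimask_output=True,   # three mask hypotheses per box
)
# Sort hypotheses best-first by predicted quality, as predict_from_box does.
best_first = sorted(zip(masks, scores), key=lambda t: t[1], reverse=True)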