FLUX.2-Image

Running on Zero

App Files Files Community

tchung1970 committed 5 days ago

Commit

add1856

1 Parent(s): 491977c

Revert "Use local text encoders instead of remote service"

Browse files

This reverts commit b07ac298ee4fd266049b29c24fa576fba90e4d0f.

Files changed (1) hide show

app.py +48 -13

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os
 import io
 import re
 import gradio as gr
@@ -6,11 +8,16 @@ import numpy as np
 import random
 import spaces
 import torch
-from diffusers import Flux2Pipeline
 from PIL import Image
 import base64
 from huggingface_hub import InferenceClient
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -44,11 +51,32 @@ Rules:
 Output only the final instruction in plain text and nothing else."""
-# Load model with local text encoders
 repo_id = "black-forest-labs/FLUX.2-dev"
 pipe = Flux2Pipeline.from_pretrained(
     repo_id,
     torch_dtype=torch.bfloat16
 )
 pipe.to(device)
@@ -131,17 +159,20 @@ def update_dimensions_from_image(image_list):
     return new_width, new_height
 # Updated duration function to match generate_image arguments (including progress)
-def get_duration(prompt, image_list, width, height, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
     num_images = 0 if image_list is None else len(image_list)
     step_duration = 1 + 0.8 * num_images
-    return max(90, num_inference_steps * step_duration + 30)  # Increased for text encoding
 @spaces.GPU(duration=get_duration)
-def generate_image(prompt, image_list, width, height, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
     generator = torch.Generator(device=device).manual_seed(seed)
     pipe_kwargs = {
-        "prompt": prompt,
         "image": image_list,
         "num_inference_steps": num_inference_steps,
         "guidance_scale": guidance_scale,
@@ -149,11 +180,11 @@ def generate_image(prompt, image_list, width, height, num_inference_steps, guida
         "width": width,
         "height": height,
     }
     # Progress bar for the actual generation steps
     if progress:
-        progress(0, desc="Generating image...")
     image = pipe(**pipe_kwargs).images[0]
     return image
@@ -180,10 +211,14 @@ def infer(prompt, aspect_ratio="1:1 (1024x1024)", progress=gr.Progress(track_tqd
     num_inference_steps = 30
     guidance_scale = 4.0
-    # Image Generation (GPU bound - includes text encoding)
-    progress(0.1, desc="Generating image...")
     image = generate_image(
-        prompt,
         None,  # No input images
         width,
         height,

 import os
+import subprocess
+import sys
 import io
 import re
 import gradio as gr
 import random
 import spaces
 import torch
+from diffusers import Flux2Pipeline, Flux2Transformer2DModel
+from diffusers import BitsAndBytesConfig as DiffBitsAndBytesConfig
+import requests
 from PIL import Image
+import json
 import base64
 from huggingface_hub import InferenceClient
+subprocess.check_call([sys.executable, "-m", "pip", "install", "spaces==0.43.0"])
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 Output only the final instruction in plain text and nothing else."""
+def remote_text_encoder(prompts):
+    from gradio_client import Client
+    client = Client("multimodalart/mistral-text-encoder")
+    result = client.predict(
+        prompt=prompts,
+        api_name="/encode_text"
+    )
+    # Load returns a tensor, usually on CPU by default
+    prompt_embeds = torch.load(result[0])
+    return prompt_embeds
+# Load model
 repo_id = "black-forest-labs/FLUX.2-dev"
+dit = Flux2Transformer2DModel.from_pretrained(
+    repo_id,
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16
+)
 pipe = Flux2Pipeline.from_pretrained(
     repo_id,
+    text_encoder=None,
+    transformer=dit,
     torch_dtype=torch.bfloat16
 )
 pipe.to(device)
     return new_width, new_height
 # Updated duration function to match generate_image arguments (including progress)
+def get_duration(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
     num_images = 0 if image_list is None else len(image_list)
     step_duration = 1 + 0.8 * num_images
+    return max(65, num_inference_steps * step_duration + 10)
 @spaces.GPU(duration=get_duration)
+def generate_image(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
+    # Move embeddings to GPU only when inside the GPU decorated function
+    prompt_embeds = prompt_embeds.to(device)
     generator = torch.Generator(device=device).manual_seed(seed)
     pipe_kwargs = {
+        "prompt_embeds": prompt_embeds,
         "image": image_list,
         "num_inference_steps": num_inference_steps,
         "guidance_scale": guidance_scale,
         "width": width,
         "height": height,
     }
     # Progress bar for the actual generation steps
     if progress:
+        progress(0, desc="Starting generation...")
     image = pipe(**pipe_kwargs).images[0]
     return image
     num_inference_steps = 30
     guidance_scale = 4.0
+    # Text Encoding (Network bound - No GPU needed)
+    progress(0.1, desc="Encoding prompt...")
+    prompt_embeds = remote_text_encoder(prompt)
+    # Image Generation (GPU bound)
+    progress(0.3, desc="Generating image...")
     image = generate_image(
+        prompt_embeds,
         None,  # No input images
         width,
         height,