From bbfc5f4a9781e3491bb21083ab8c251b6e5ec382 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 17:23:42 -0600 Subject: [PATCH 01/34] Basic implementation of request scheduling --- src/diffusers/pipelines/pipeline_utils.py | 35 +++++++++++++++++++ src/diffusers/schedulers/scheduling_amused.py | 6 ++++ .../scheduling_consistency_decoder.py | 6 ++++ .../scheduling_consistency_models.py | 6 ++++ .../scheduling_cosine_dpmsolver_multistep.py | 6 ++++ src/diffusers/schedulers/scheduling_ddim.py | 6 ++++ .../schedulers/scheduling_ddim_cogvideox.py | 6 ++++ .../schedulers/scheduling_ddim_inverse.py | 6 ++++ .../schedulers/scheduling_ddim_parallel.py | 7 ++++ src/diffusers/schedulers/scheduling_ddpm.py | 6 ++++ .../schedulers/scheduling_ddpm_parallel.py | 6 ++++ .../schedulers/scheduling_ddpm_wuerstchen.py | 6 ++++ .../schedulers/scheduling_deis_multistep.py | 7 ++++ .../schedulers/scheduling_dpm_cogvideox.py | 6 ++++ .../scheduling_dpmsolver_multistep.py | 6 ++++ .../scheduling_dpmsolver_multistep_inverse.py | 7 ++++ .../schedulers/scheduling_dpmsolver_sde.py | 6 ++++ .../scheduling_dpmsolver_singlestep.py | 6 ++++ .../scheduling_edm_dpmsolver_multistep.py | 6 ++++ .../schedulers/scheduling_edm_euler.py | 6 ++++ .../scheduling_euler_ancestral_discrete.py | 6 ++++ .../schedulers/scheduling_euler_discrete.py | 6 ++++ .../scheduling_flow_match_euler_discrete.py | 6 ++++ src/diffusers/schedulers/scheduling_sde_ve.py | 6 ++++ 24 files changed, 176 insertions(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 023feae4dd27..08627a172df1 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -68,6 +68,8 @@ ) from ..utils.hub_utils import _check_legacy_sharding_variant_format, load_or_create_model_card, populate_model_card from ..utils.torch_utils import empty_device_cache, get_device, is_compiled_module +import copy +from types import SimpleNamespace if is_torch_npu_available(): @@ -177,6 +179,39 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) +import copy +from typing import Optional + +class RequestScopedPipeline: + def __init__(self, pipeline: "DiffusionPipeline"): + self._base = pipeline + self.unet = pipeline.unet + self.vae = pipeline.vae + self.text_encoder = getattr(pipeline, "text_encoder", None) + self.components = pipeline.components + + def _make_local_scheduler(self, num_inference_steps: int, **clone_kwargs): + base_sched = self._base.scheduler + if hasattr(base_sched, "clone_for_request"): + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + return copy.deepcopy(base_sched) + + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): + + local_scheduler = self._make_local_scheduler(num_inference_steps, device=device) + + local_pipe = copy.copy(self._base) + local_pipe.scheduler = local_scheduler + + if hasattr(local_pipe, "model_cpu_offload_context"): + cm = getattr(local_pipe, "model_cpu_offload_context") + if callable(cm): + with cm(): + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + + class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" Base class for all pipelines. 
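A minimal usage sketch of the `RequestScopedPipeline` wrapper added above (illustrative only: the checkpoint id and prompt are placeholders, and the class is imported from `diffusers.pipelines.pipeline_utils`, where this patch defines it):

    import torch
    from diffusers import StableDiffusionPipeline
    from diffusers.pipelines.pipeline_utils import RequestScopedPipeline

    # Load the heavy components (unet, vae, text encoder) once; they stay shared.
    base = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # Each generate() call runs on a shallow per-request copy of the pipeline with
    # its own scheduler (cloned via clone_for_request, or deepcopied as a fallback),
    # so concurrent requests do not overwrite each other's timestep state.
    request_pipe = RequestScopedPipeline(base)
    result = request_pipe.generate("a watercolor fox", num_inference_steps=30)
    result.images[0].save("fox.png")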
diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index 238b8d869171..ee767380e2f7 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -7,6 +7,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from .scheduling_utils import SchedulerMixin +import copy def gumbel_noise(t, generator=None): @@ -160,3 +161,8 @@ def add_noise(self, sample, timesteps, generator=None): masked_sample[mask_indices] = self.config.mask_token_id return masked_sample + + def clone_for_request(self, num_inference_steps: int, temperature=(2, 0), device: Union[str, torch.device] = None): + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, temperature=temperature, device=device) + return local diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index d7af018b284a..7bf3ec6f4aeb 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -8,6 +8,7 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin +import copy # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -109,6 +110,11 @@ def set_timesteps( self.c_out = self.c_out.to(device) self.c_in = self.c_in.to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + @property def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 0f5062258800..271369777301 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -243,6 +243,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): """Constructs the noise schedule of Karras et al. 
(2022).""" diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index 66ed296da8ea..ecda598b8ce3 100644 --- a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -241,6 +241,12 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # if a noise sampler is used, reinitialise it self.noise_sampler = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 5ee0d084f060..9dc1006ee2a1 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -339,6 +339,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py index c19efdc7834d..3e91077b7e50 100644 --- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py @@ -302,6 +302,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 49dba840d089..fba349c8fc9f 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -286,6 +286,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 7c3f03a8dbe1..49107c9bca17 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -362,6 +362,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) 
+ + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 0fab6d910a82..be6d7ad4880d 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -322,6 +322,12 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index ec741f9ecb7d..571aaf52bccc 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -332,6 +332,12 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 71f08277ebd7..126956204880 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -161,6 +161,12 @@ def set_timesteps( timesteps = torch.Tensor(timesteps).to(device) self.timesteps = timesteps + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 7d8685ba10c3..13adec66870c 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -317,6 +317,13 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py 
b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py index f7b63720e107..6de6d07f11c8 100644 --- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py @@ -303,6 +303,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index d07ff8b2007b..407215937fa6 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -457,6 +457,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 9ec958851111..fd886b48eb22 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -329,6 +329,13 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._step_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index eeb06773d977..9bba69be9e49 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -412,6 +412,12 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py 
b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 8663210a6244..9d0bebe13d99 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -407,6 +407,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index f1b38aaff56c..105603e01f8d 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -273,6 +273,12 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index dbeff3de5652..20d3be9756dc 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -261,6 +261,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 9cdaa2c5e101..5713ffcfdee0 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -318,6 +318,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index f58d918dbfbe..fee2d03e5291 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -449,6 +449,12 @@ def set_timesteps( self._begin_index = None self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(np.maximum(sigma, 1e-10)) diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index 1a4f12ddfa53..258e8252f557 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -348,6 +348,12 @@ def set_timesteps( self._step_index = None self._begin_index = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 1bfc08cce5e9..d31c6a9430cb 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -24,6 +24,7 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin, SchedulerOutput +import copy @dataclass @@ -122,6 +123,11 @@ def set_timesteps( self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) + def clone_for_request(self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None): + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, sampling_eps=sampling_eps, device=device) + return local + def set_sigmas( self, num_inference_steps: int, sigma_min: float = None, 
sigma_max: float = None, sampling_eps: float = None ): From a308e3ed48185ca0a6a7e5e238011ebe1a0f81ea Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 22:04:27 -0600 Subject: [PATCH 02/34] Basic editing in SD and Flux Pipelines --- src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/flux/pipeline_flux.py | 84 ++++++++++++++----- src/diffusers/pipelines/pipeline_utils.py | 49 +++++++++-- .../pipeline_stable_diffusion.py | 83 +++++++++++++----- .../pipeline_stable_diffusion_3.py | 82 +++++++++++++----- src/diffusers/schedulers/scheduling_tcd.py | 6 ++ src/diffusers/schedulers/scheduling_unclip.py | 6 ++ .../schedulers/scheduling_unipc_multistep.py | 7 ++ .../schedulers/scheduling_vq_diffusion.py | 6 ++ 9 files changed, 257 insertions(+), 67 deletions(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 25d5d213cf33..df9ecff685a2 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -521,6 +521,7 @@ DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin, + RequestScopedPipeline ) try: diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 124e611bd018..df1a4062fbea 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -13,8 +13,8 @@ # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union - +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import copy import numpy as np import torch from transformers import ( @@ -91,10 +91,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -111,36 +119,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. 
+ `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class FluxPipeline( diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 08627a172df1..0a53a189745c 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -21,7 +21,8 @@ import sys from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin +from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin, Iterable +import copy import numpy as np import PIL.Image @@ -179,35 +180,65 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -import copy -from typing import Optional - class RequestScopedPipeline: - def __init__(self, pipeline: "DiffusionPipeline"): + DEFAULT_MUTABLE_ATTRS = [ + "_all_hooks", + "_offload_device", + "_progress_bar_config", + "_progress_bar", + "_rng_state", + "_last_seed", + ] + + def __init__(self, pipeline: "DiffusionPipeline", mutable_attrs: Optional[Iterable[str]] = None): self._base = pipeline self.unet = pipeline.unet self.vae = pipeline.vae self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = pipeline.components + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) def _make_local_scheduler(self, num_inference_steps: int, **clone_kwargs): base_sched = self._base.scheduler if hasattr(base_sched, "clone_for_request"): - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + try: + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + except Exception as e: + logger.debug(f"clone_for_request failed: {e}, falling back to deepcopy()") return copy.deepcopy(base_sched) + def _clone_mutable_attrs(self, base, local): + for attr in self._mutable_attrs: + if hasattr(base, attr): + val = getattr(base, attr) + # safe shallow copy for common containers + if isinstance(val, dict): + setattr(local, attr, dict(val)) + elif isinstance(val, (list, tuple, set)): + setattr(local, attr, list(val)) + else: + try: + setattr(local, attr, copy.copy(val)) + except Exception: + setattr(local, attr, val) + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): local_scheduler = self._make_local_scheduler(num_inference_steps, device=device) local_pipe = copy.copy(self._base) + local_pipe.scheduler = local_scheduler + self._clone_mutable_attrs(self._base, local_pipe) - if hasattr(local_pipe, "model_cpu_offload_context"): - cm = getattr(local_pipe, "model_cpu_offload_context") - if callable(cm): + cm = getattr(local_pipe, "model_cpu_offload_context", None) + if callable(cm): + try: with cm(): return local_pipe(*args, 
num_inference_steps=num_inference_steps, **kwargs) + except TypeError: + with cm: + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index cb97f18efeff..ebc87f30a7f3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union - +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import copy import torch from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -101,8 +101,16 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -119,36 +127,73 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps + class StableDiffusionPipeline( diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 1618f89a49e3..0ee5ad4bc949 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -13,8 +13,8 @@ # limitations under the License. 
import inspect -from typing import Any, Callable, Dict, List, Optional, Union - +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import copy import torch from transformers import ( CLIPTextModelWithProjection, @@ -95,8 +95,16 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -113,36 +121,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin): diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 3fd5c341eca9..01a47bbd52a5 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -521,6 +521,12 @@ def set_timesteps( self._step_index = None self._begin_index = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index d78efabfbc57..4b07949ac30f 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -177,6 +177,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: prev_timestep = t - 1 diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 162a34bd2774..b0bc1d1a8b16 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -429,6 +429,13 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 57306301d023..7ab4f151de65 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ 
-197,6 +197,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.log_cumprod_bt = self.log_cumprod_bt.to(device) self.log_cumprod_ct = self.log_cumprod_ct.to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, From 4799b8eab2461b26ae33848bd24c738d7b402325 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 22:40:53 -0600 Subject: [PATCH 03/34] Small Fix --- src/diffusers/pipelines/pipeline_utils.py | 160 ++++++++++++++++++---- 1 file changed, 136 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 0a53a189745c..1bff80bd6bf7 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -188,47 +188,155 @@ class RequestScopedPipeline: "_progress_bar", "_rng_state", "_last_seed", + "latents", ] - def __init__(self, pipeline: "DiffusionPipeline", mutable_attrs: Optional[Iterable[str]] = None): + def __init__( + self, + pipeline: Any, + mutable_attrs: Optional[Iterable[str]] = None, + auto_detect_mutables: bool = True, + tensor_numel_threshold: int = 1_000_000, + ): self._base = pipeline - self.unet = pipeline.unet - self.vae = pipeline.vae + self.unet = getattr(pipeline, "unet", None) + self.vae = getattr(pipeline, "vae", None) self.text_encoder = getattr(pipeline, "text_encoder", None) - self.components = pipeline.components + self.components = getattr(pipeline, "components", None) + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) - def _make_local_scheduler(self, num_inference_steps: int, **clone_kwargs): - base_sched = self._base.scheduler + self._auto_detect_mutables = bool(auto_detect_mutables) + self._tensor_numel_threshold = int(tensor_numel_threshold) + + self._auto_detected_attrs: List[str] = [] + + def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): + base_sched = getattr(self._base, "scheduler", None) + if base_sched is None: + return None + if hasattr(base_sched, "clone_for_request"): try: - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) except Exception as e: - logger.debug(f"clone_for_request failed: {e}, falling back to deepcopy()") - return copy.deepcopy(base_sched) + logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + + try: + return copy.deepcopy(base_sched) + except Exception as e: + logger.warning(f"Deepcopy of scheduler failed: {e}. 
Returning original scheduler (*risky*).") + return base_sched + + def _autodetect_mutables(self, max_attrs: int = 40): + if not self._auto_detect_mutables: + return [] + + if self._auto_detected_attrs: + return self._auto_detected_attrs + + candidates: List[str] = [] + seen = set() + for name in dir(self._base): + if name.startswith("__"): + continue + if name in self._mutable_attrs: + continue + if name in ("to", "save_pretrained", "from_pretrained"): + continue + try: + val = getattr(self._base, name) + except Exception: + continue + + import types + + # skip callables and modules + if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): + continue + + # containers -> candidate + if isinstance(val, (dict, list, set, tuple, bytearray)): + candidates.append(name) + seen.add(name) + else: + # try Tensor detection + try: + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + candidates.append(name) + seen.add(name) + else: + logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}") + except Exception: + continue + + if len(candidates) >= max_attrs: + break + + self._auto_detected_attrs = candidates + logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") + return self._auto_detected_attrs def _clone_mutable_attrs(self, base, local): - for attr in self._mutable_attrs: - if hasattr(base, attr): + attrs_to_clone = list(self._mutable_attrs) + attrs_to_clone.extend(self._autodetect_mutables()) + + for attr in attrs_to_clone: + if not hasattr(base, attr): + continue + try: val = getattr(base, attr) - # safe shallow copy for common containers - if isinstance(val, dict): + except Exception: + continue + + # shallow copy for common containers + if isinstance(val, dict): + try: setattr(local, attr, dict(val)) - elif isinstance(val, (list, tuple, set)): + except Exception: + setattr(local, attr, val) + elif isinstance(val, (list, tuple, set)): + try: setattr(local, attr, list(val)) - else: - try: - setattr(local, attr, copy.copy(val)) - except Exception: - setattr(local, attr, val) + except Exception: + setattr(local, attr, val) + elif isinstance(val, bytearray): + try: + setattr(local, attr, bytearray(val)) + except Exception: + setattr(local, attr, val) + else: + try: + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + setattr(local, attr, val.clone()) + else: + setattr(local, attr, val) + else: + try: + setattr(local, attr, copy.copy(val)) + except Exception: + setattr(local, attr, val) + except Exception: + setattr(local, attr, val) + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): + local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) - local_scheduler = self._make_local_scheduler(num_inference_steps, device=device) + try: + local_pipe = copy.copy(self._base) + except Exception as e: + logger.warning(f"copy.copy(self._base) failed: {e}. 
Falling back to deepcopy (may increase memory).") + local_pipe = copy.deepcopy(self._base) + if local_scheduler is not None: + try: + setattr(local_pipe, "scheduler", local_scheduler) + except Exception: + logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") - local_pipe = copy.copy(self._base) - local_pipe.scheduler = local_scheduler self._clone_mutable_attrs(self._base, local_pipe) cm = getattr(local_pipe, "model_cpu_offload_context", None) @@ -237,8 +345,12 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = with cm(): return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) except TypeError: - with cm: - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + # cm may already be an instantiated context manager rather than a callable + try: + with cm: + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except Exception as e: + logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) From eda58477cd3882ebb48b114c90851007556c10b0 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 22:50:41 -0600 Subject: [PATCH 04/34] Fix --- src/diffusers/pipelines/pipeline_utils.py | 56 +++++++++++++++-------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 1bff80bd6bf7..e2b2d6e84ced 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -278,48 +278,68 @@ def _autodetect_mutables(self, max_attrs: int = 40): logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") return self._auto_detected_attrs + def _is_readonly_property(self, base_obj, attr_name: str) -> bool: + try: + cls = type(base_obj) + descriptor = getattr(cls, attr_name, None) + if isinstance(descriptor, property): + return descriptor.fset is None + if hasattr(descriptor, "__set__") is False and descriptor is not None: + return False + except Exception: + pass + return False + def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) + EXCLUDE_ATTRS = {"components",} # add more here if other problematic attributes turn up + for attr in attrs_to_clone: + if attr in EXCLUDE_ATTRS: + logger.debug(f"Skipping excluded attr '{attr}'") + continue if not hasattr(base, attr): continue + if self._is_readonly_property(base, attr): + logger.debug(f"Skipping read-only property '{attr}'") + continue + try: val = getattr(base, attr) - except Exception: + except Exception as e: + logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}") continue - # shallow copy for common containers - if isinstance(val, dict): - try: + try: + if isinstance(val, dict): setattr(local, attr, dict(val)) - except Exception: - setattr(local, attr, val) - elif isinstance(val, (list, tuple, set)): - try: + elif isinstance(val, (list, tuple, set)): setattr(local, attr, list(val)) - except Exception: - setattr(local, attr, val) - elif isinstance(val, bytearray): - try: + elif isinstance(val, bytearray): setattr(local, attr, bytearray(val)) - except Exception: - setattr(local, attr, val) - else: - try: + else: + # small tensors or atomic values if isinstance(val, torch.Tensor): if val.numel() <= self._tensor_numel_threshold: setattr(local, attr, val.clone()) else: + # 
don't clone big tensors, keep reference setattr(local, attr, val) else: try: setattr(local, attr, copy.copy(val)) except Exception: + # último recurso: asignar referencia setattr(local, attr, val) - except Exception: - setattr(local, attr, val) + except (AttributeError, TypeError) as e: + logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}") + # continuar sin fallar + continue + except Exception as e: + logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") + continue def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): From 6b5e6be114637340ca25effea9bfd0022e0f0ffd Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 7 Sep 2025 11:16:58 -0600 Subject: [PATCH 05/34] Update for more pipelines --- .../pipeline_stable_diffusion_xl.py | 81 ++++++++++++++----- .../pipeline_stable_diffusion_adapter.py | 80 +++++++++++++----- .../pipeline_stable_diffusion_xl_adapter.py | 80 +++++++++++++----- 3 files changed, 187 insertions(+), 54 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index b97cf6f1f6f8..81f1580fce4a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -120,10 +120,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -140,36 +148,73 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. 
""" + import copy + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusionXLPipeline( diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 1ce6987114a7..63f40497afff 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -136,10 +136,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -156,36 +164,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 2802d690f3cc..74a1a0bb1b22 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -161,10 +161,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -181,36 +189,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusionXLAdapterPipeline( From df2933f727f2477690b443ecb6573f33502a7923 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 7 Sep 2025 12:37:56 -0600 Subject: [PATCH 06/34] Add examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 123 ++++++++ .../server-async/DiffusersServer/__init__.py | 3 + .../DiffusersServer/create_server.py | 45 +++ .../DiffusersServer/serverasync.py | 290 ++++++++++++++++++ .../DiffusersServer/superpipeline.py | 44 +++ .../DiffusersServer/uvicorn_diffu.py | 88 ++++++ examples/server-async/README.md | 118 +++++++ examples/server-async/requirements.txt | 6 + examples/server-async/server.py | 11 + examples/server-async/test.py | 60 ++++ 10 files changed, 788 insertions(+) create mode 100644 examples/server-async/DiffusersServer/Pipelines.py create mode 100644 examples/server-async/DiffusersServer/__init__.py create mode 100644 examples/server-async/DiffusersServer/create_server.py create mode 100644 examples/server-async/DiffusersServer/serverasync.py create mode 100644 examples/server-async/DiffusersServer/superpipeline.py create mode 100644 examples/server-async/DiffusersServer/uvicorn_diffu.py create mode 100644 examples/server-async/README.md create mode 100644 examples/server-async/requirements.txt create mode 100644 examples/server-async/server.py create mode 100644 examples/server-async/test.py diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py new file mode 100644 index 000000000000..648f708fd562 --- /dev/null +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -0,0 +1,123 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/Pipelines.py + +from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline +from diffusers.pipelines.flux.pipeline_flux import FluxPipeline +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline +import torch +import os +import logging +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + +class TextToImageInput(BaseModel): + model: str + prompt: str + size: str | None = None + n: int | None = None + +class TextToImagePipelineSD3: + def __init__(self, model_path: str | None = None): + """ + Inicialización de la clase con la ruta del modelo. + Si no se proporciona, se obtiene de la variable de entorno. + """ + self.model_path = model_path or os.getenv("MODEL_PATH") + self.pipeline: StableDiffusion3Pipeline = None + self.device: str = None + + def start(self): + """ + Inicia el pipeline cargando el modelo en CUDA o MPS según esté disponible. 
+        Uses the model path defined in __init__ and assigns a default value
+        based on the available device if none was provided.
+        """
+        if torch.cuda.is_available():
+            # If model_path was not set, assign the default model for CUDA.
+            model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large"
+            logger.info("Loading CUDA")
+            self.device = "cuda"
+            self.pipeline = StableDiffusion3Pipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+            ).to(device=self.device)
+        elif torch.backends.mps.is_available():
+            # If model_path was not set, assign the default model for MPS.
+            model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium"
+            logger.info("Loading MPS for Mac M Series")
+            self.device = "mps"
+            self.pipeline = StableDiffusion3Pipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+            ).to(device=self.device)
+        else:
+            raise Exception("No CUDA or MPS device available")
+
+class TextToImagePipelineFlux:
+    def __init__(self, model_path: str | None = None, low_vram: bool = False):
+        """
+        Initializes the class with the model path.
+        If it is not provided, it is read from the environment variable.
+        """
+        self.model_path = model_path or os.getenv("MODEL_PATH")
+        self.pipeline: FluxPipeline = None
+        self.device: str = None
+        self.low_vram = low_vram
+
+    def start(self):
+        if torch.cuda.is_available():
+            # If model_path was not set, assign the default model for CUDA.
+            model_path = self.model_path or "black-forest-labs/FLUX.1-schnell"
+            logger.info("Loading CUDA")
+            self.device = "cuda"
+            self.pipeline = FluxPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+            ).to(device=self.device)
+            if self.low_vram:
+                self.pipeline.enable_model_cpu_offload()
+            else:
+                pass
+        elif torch.backends.mps.is_available():
+            # If model_path was not set, assign the default model for MPS.
+            model_path = self.model_path or "black-forest-labs/FLUX.1-schnell"
+            logger.info("Loading MPS for Mac M Series")
+            self.device = "mps"
+            self.pipeline = FluxPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+            ).to(device=self.device)
+        else:
+            raise Exception("No CUDA or MPS device available")
+
+class TextToImagePipelineSD:
+    def __init__(self, model_path: str | None = None):
+        """
+        Initializes the class with the model path.
+        If it is not provided, it is read from the environment variable.
+        """
+        self.model_path = model_path or os.getenv("MODEL_PATH")
+        self.pipeline: StableDiffusionPipeline = None
+        self.device: str = None
+
+    def start(self):
+        if torch.cuda.is_available():
+            # If model_path was not set, assign the default model for CUDA.
+            model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5"
+            logger.info("Loading CUDA")
+            self.device = "cuda"
+            self.pipeline = StableDiffusionPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+            ).to(device=self.device)
+        elif torch.backends.mps.is_available():
+            # If model_path was not set, assign the default model for MPS.
+ model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" + logger.info("Loading MPS for Mac M Series") + self.device = "mps" + self.pipeline = StableDiffusionPipeline.from_pretrained( + model_path, + torch_dtype=torch.float16, + ).to(device=self.device) + else: + raise Exception("No CUDA or MPS device available") \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/__init__.py b/examples/server-async/DiffusersServer/__init__.py new file mode 100644 index 000000000000..d4dc75b71a1f --- /dev/null +++ b/examples/server-async/DiffusersServer/__init__.py @@ -0,0 +1,3 @@ +from .Pipelines import TextToImagePipelineSD3 +from .superpipeline import SuperPipelinesT2Img +from .create_server import create_inference_server_Async as DiffusersServerApp \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/create_server.py b/examples/server-async/DiffusersServer/create_server.py new file mode 100644 index 000000000000..a5e6357db9d7 --- /dev/null +++ b/examples/server-async/DiffusersServer/create_server.py @@ -0,0 +1,45 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/create_server.py + +from .Pipelines import * +from .serverasync import * +from .uvicorn_diffu import * +import asyncio + +def create_inference_server_Async( + model:str, + type_model: str = 't2im', + host: str = '0.0.0.0', + port: int = 8500, + threads=5, + enable_memory_monitor=True, + custom_model: bool = False, + custom_pipeline: Optional[Type] | None = None, + constructor_pipeline: Optional[Type] | None = None, + components: Optional[Dict[str, Any]] = None, + api_name: Optional[str] = 'custom_api', + torch_dtype = torch.bfloat16 +): + config = ServerConfigModels( + model=model, + type_models=type_model, + custom_model=custom_model, + custom_pipeline=custom_pipeline, + constructor_pipeline=constructor_pipeline, + components=components, + api_name=api_name, + torch_dtype=torch_dtype, + host=host, + port=port + ) + + app = create_app_fastapi(config) + + asyncio.run(run_uvicorn_server( + app, + host=host, + port=port, + workers=threads, + enable_memory_monitor=enable_memory_monitor + )) + + return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py new file mode 100644 index 000000000000..303f1aa31b3f --- /dev/null +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -0,0 +1,290 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/serverasync.py + +from fastapi import FastAPI, HTTPException, status +from fastapi.responses import FileResponse +from fastapi.middleware.cors import CORSMiddleware +from fastapi.concurrency import run_in_threadpool +from pydantic import BaseModel +from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD +import logging +from diffusers.utils.export_utils import export_to_video +from diffusers.pipelines.pipeline_utils import RequestScopedPipeline +from diffusers import * +from .superpipeline import * +import random +import uuid +import tempfile +from dataclasses import dataclass +import os +import torch +import threading +import gc +from typing import Optional, Dict, Any, Type +from dataclasses import dataclass, field +from typing import List + +@dataclass +class PresetModels: + SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) + SD3_5: List[str] = field(default_factory=lambda: 
['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) + Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-schnell']) + +class ModelPipelineInitializer: + def __init__(self, model: str = '', type_models: str = 't2im'): + self.model = model + self.type_models = type_models + self.pipeline = None + self.device = "cuda" if torch.cuda.is_available() else "mps" + self.model_type = None + + def initialize_pipeline(self): + if not self.model: + raise ValueError("Model name not provided") + + # Check if model exists in PresetModels + preset_models = PresetModels() + + # Determine which model type we're dealing with + if self.model in preset_models.SD3: + self.model_type = "SD3" + elif self.model in preset_models.SD3_5: + self.model_type = "SD3_5" + elif self.model in preset_models.Flux: + self.model_type = "Flux" + else: + self.model_type = "SD" + + # Create appropriate pipeline based on model type and type_models + if self.type_models == 't2im': + if self.model_type in ["SD3", "SD3_5"]: + self.pipeline = TextToImagePipelineSD3(self.model) + elif self.model_type == "Flux": + self.pipeline = TextToImagePipelineFlux(self.model) + elif self.model_type == "SD": + self.pipeline = TextToImagePipelineSD(self.model) + else: + raise ValueError(f"Model type {self.model_type} not supported for text-to-image") + elif self.type_models == 't2v': + raise ValueError(f"Unsupported type_models: {self.type_models}") + + return self.pipeline + +class Utils: + def __init__(self, host: str = '0.0.0.0', port: int = 8500): + self.service_url = f"http://{host}:{port}" + self.image_dir = os.path.join(tempfile.gettempdir(), "images") + if not os.path.exists(self.image_dir): + os.makedirs(self.image_dir) + + self.video_dir = os.path.join(tempfile.gettempdir(), "videos") + if not os.path.exists(self.video_dir): + os.makedirs(self.video_dir) + + def save_image(self, image): + if hasattr(image, "to"): + try: + image = image.to("cpu") + except Exception: + pass + + if isinstance(image, torch.Tensor): + from torchvision import transforms + to_pil = transforms.ToPILImage() + image = to_pil(image.squeeze(0).clamp(0, 1)) + + filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" + image_path = os.path.join(self.image_dir, filename) + logger.info(f"Saving image to {image_path}") + + image.save(image_path, format="PNG", optimize=True) + + del image + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return os.path.join(self.service_url, "images", filename) + + def save_video(self, video, fps): + filename = "video" + str(uuid.uuid4()).split("-")[0] + ".mp4" + video_path = os.path.join(self.video_dir, filename) + export = export_to_video(video, video_path, fps=fps) + logger.info(f"Saving video to {video_path}") + return os.path.join(self.service_url, "video", filename) + +@dataclass +class ServerConfigModels: + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' + custom_model : bool = False + constructor_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None + components: Optional[Dict[str, Any]] = None + api_name: Optional[str] = 'custom_api' + torch_dtype: Optional[torch.dtype] = None + host: str = '0.0.0.0' + port: int = 8500 + +def create_app_fastapi(config: ServerConfigModels) -> FastAPI: + app = FastAPI() + + class JSONBodyQueryAPI(BaseModel): + model : str | None = None + prompt : str + negative_prompt : str | None = 
None
+        num_inference_steps : int = 28
+        num_images_per_prompt : int = 1
+
+    logging.basicConfig(level=logging.INFO)
+    global logger
+    logger = logging.getLogger(__name__)
+
+    server_config = config or ServerConfigModels()
+    app.state.SERVER_CONFIG = server_config
+
+    global utils_app
+
+    utils_app = Utils(host=server_config.host, port=server_config.port)
+
+    logger.info(f"Initializing pipeline for model: {server_config.model}")
+    try:
+        if server_config.custom_model:
+            if server_config.constructor_pipeline is None:
+                raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required")
+            initializer = server_config.constructor_pipeline(
+                model_path=server_config.model,
+                pipeline=server_config.custom_pipeline,
+                torch_dtype=server_config.torch_dtype,
+                components=server_config.components,
+            )
+            model_pipeline = initializer.start()
+            app.state.CUSTOM_PIPELINE = server_config.custom_pipeline
+            app.state.MODEL_PIPELINE = model_pipeline
+            app.state.MODEL_INITIALIZER = initializer
+            logger.info(f"Custom pipeline initialized. Type: {type(model_pipeline)}")
+        else:
+            initializer = ModelPipelineInitializer(
+                model=server_config.model,
+                type_models=server_config.type_models,
+            )
+            model_pipeline = initializer.initialize_pipeline()
+            model_pipeline.start()
+
+            app.state.REQUEST_PIPE = RequestScopedPipeline(model_pipeline.pipeline)
+
+            # Lock for concurrency
+            pipeline_lock = threading.Lock()
+
+            app.state.MODEL_PIPELINE = model_pipeline
+            app.state.PIPELINE_LOCK = pipeline_lock
+            app.state.MODEL_INITIALIZER = initializer
+
+            logger.info("Pipeline initialized and ready to receive requests")
+    except Exception as e:
+        logger.error(f"Error initializing pipeline: {e}")
+        raise
+
+
+    @app.get("/")
+    async def root():
+        return {"message": "Welcome to the Diffusers Server"}
+
+    @app.post("/api/diffusers/inference")
+    async def api(json: JSONBodyQueryAPI):
+        prompt = json.prompt
+        negative_prompt = json.negative_prompt or ""
+        num_steps = json.num_inference_steps
+        num_images_per_prompt = json.num_images_per_prompt
+
+        wrapper = app.state.MODEL_PIPELINE
+        initializer = app.state.MODEL_INITIALIZER
+
+
+        if not wrapper or not wrapper.pipeline:
+            raise HTTPException(500, "Model not initialized correctly")
+        if not prompt.strip():
+            raise HTTPException(400, "No prompt provided")
+
+        def make_generator():
+            g = torch.Generator(device=initializer.device)
+            return g.manual_seed(random.randint(0, 10_000_000))
+
+        req_pipe = app.state.REQUEST_PIPE
+
+        def infer():
+            # The RequestScopedPipeline already handles everything needed for
+            # per-request inference internally; only the base model pipeline
+            # (e.g. StableDiffusion3Pipeline) has to be passed in.
+            gen = make_generator()
+            return req_pipe.generate(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                generator=gen,
+                num_inference_steps=num_steps,
+                num_images_per_prompt=num_images_per_prompt,
+                device=initializer.device
+            )
+
+        try:
+            output = await run_in_threadpool(infer)
+
+            urls = [utils_app.save_image(img) for img in output.images]
+            return {"response": urls}
+
+        except Exception as e:
+            logger.error(f"Error during inference: {e}")
+            raise HTTPException(500, f"Processing error: {e}")
+
+        finally:
+            import gc; gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+
+    @app.get("/images/{filename}")
+    async def serve_image(filename: str):
+        file_path = os.path.join(utils_app.image_dir, filename)
+        if not os.path.isfile(file_path):
+            raise 
HTTPException(status_code=404, detail="Image not found") + return FileResponse(file_path, media_type="image/png") + + @app.get("/api/models") + async def list_models(): + return { + "current_model" : server_config.model, + "type" : server_config.type_models, + "all_models": { + "type": "T2Img", + "SD3": PresetModels().SD3, + "SD3_5": PresetModels().SD3_5, + "Flux": PresetModels().Flux, + } + } + + @app.get("/api/status") + async def get_status(): + memory_info = {} + if torch.cuda.is_available(): + memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB + memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB + memory_info = { + "memory_allocated_gb": round(memory_allocated, 2), + "memory_reserved_gb": round(memory_reserved, 2), + "device": torch.cuda.get_device_name(0) + } + + return { + "current_model" : server_config.model, + "type_models" : server_config.type_models, + "memory" : memory_info} + + + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py new file mode 100644 index 000000000000..394ebac39011 --- /dev/null +++ b/examples/server-async/DiffusersServer/superpipeline.py @@ -0,0 +1,44 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/superpipeline.py + +from diffusers.pipelines import * +from diffusers import * +import torch +from typing import Optional, Dict, Any, Type +import logging + +logger = logging.getLogger(__name__) + +class SuperPipelinesT2Img: + def __init__(self, model_path: str, + pipeline: Type, + torch_dtype = torch.bfloat16, + components: Optional[Dict[str, Any]] = None,): + self.model_path = model_path + self.pipeline = pipeline + self.torch_dtype = torch_dtype + self.components = components or {} + self.device: str = None + + def start(self): + if torch.cuda.is_available(): + logger.info("Loading CUDA") + model_path = self.model_path + self.device = 'cuda' + self.pipeline = self.pipeline.from_pretrained( + model_path, + torch_dtype = self.torch_dtype, + ** self.components + ).to(device=self.device) + elif torch.backends.mps.is_available(): + logger.info("Loading MPS for Mac M Series") + model_path = self.model_path + self.device = 'mps' + self.pipeline = self.pipeline.from_pretrained( + model_path, + torch_dtype = self.torch_dtype, + **self.components + ).to(device=self.device) + else: + raise Exception("No CUDA or MPS device available") + + return self \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py new file mode 100644 index 000000000000..7e19b50f3cbe --- /dev/null +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -0,0 +1,88 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/uvicorn_diffu.py + +import uvicorn +import logging +import gc +import psutil +import os +import threading +import time +import string + +def setup_logging(): + logging.basicConfig(level=logging.INFO) + return logging.getLogger('uvicorn') + +logger = setup_logging() + +def memory_cleanup(interval=30): + while True: + try: + + gc.collect() + + + process = psutil.Process(os.getpid()) + mem = process.memory_info().rss / 1024 / 1024 + logger.info(f"Memoria en uso: {mem:.2f} MB") + + time.sleep(interval) + except Exception as e: + logger.error(f"Error en 
limpieza de memoria: {str(e)}") + time.sleep(interval) + +def run_uvicorn_server( + app, + host='0.0.0.0', + port=8500, + workers=5, + cleanup_interval=30, + channel_timeout=900, + headers=[ + ("server", "DiffusersServer") + ], + enable_memory_monitor=True +): + """ + Ejecuta un servidor de FastAPI utilizando Uvicorn con monitoreo de memoria opcional + + Args: + app: Aplicación FastAPI + host (str): Host donde se servirá la aplicación + port (int): Puerto para el servidor + workers (int): Número de hilos para Uvicorn + cleanup_interval (int): Intervalo de limpieza para Uvicorn + channel_timeout (int): Tiempo de espera máximo para canales + server_header (bool): Activar el identificador / Header del servidor + headers (str): Identificador del servidor / Header del servidor + enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria + + Returns: + El resultado de serve() (aunque normalmente no retorna) + """ + gc.enable() + gc.set_threshold(700, 10, 5) + + if enable_memory_monitor: + cleanup_thread = threading.Thread( + target=memory_cleanup, + args=(cleanup_interval,), + daemon=True + ) + cleanup_thread.start() + logger.info("Monitor de memoria activado") + + logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") + + config = uvicorn.Config( + app=app, + host=host, + workers=workers, + port=port, + timeout_keep_alive=channel_timeout, + headers=headers + ) + + server = uvicorn.Server(config) + + return server.serve() \ No newline at end of file diff --git a/examples/server-async/README.md b/examples/server-async/README.md new file mode 100644 index 000000000000..a13529b7d555 --- /dev/null +++ b/examples/server-async/README.md @@ -0,0 +1,118 @@ +# Asynchronous server and parallel execution of models + +> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. + +## ⚠️ IMPORTANT + +* This example uses a custom Diffusers fork: `https://github.com/F4k3r22/diffusers-async`. +* The server and inference harness live in this repo: `https://github.com/F4k3r22/DiffusersServer`. + The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. + +## Necessary components + +All the components needed to create the inference server are in `DiffusersServer/` + +``` +DiffusersServer/ # the example server package +├── __init__.py +├── create_server.py # helper script to build/run the app programmatically +├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) +├── serverasync.py # FastAPI app factory (create_app_fastapi) +├── superpipeline.py # optional custom pipeline glue code +├── uvicorn_diffu.py # convenience script to start uvicorn with recommended flags +``` + + +## What `diffusers-async` adds / Why we needed it + +Core problem: a naive server that calls `pipe.__call__` concurrently can hit **race conditions** (e.g., `scheduler.set_timesteps` mutates shared state) or explode memory by deep-copying the whole pipeline per-request. + +`diffusers-async` / this example addresses that by: + +* **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*. 
+* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. Where available we call `scheduler.clone_for_request(...)`, otherwise we fallback to safe `deepcopy` or other heuristics. +* **`retrieve_timesteps(..., return_scheduler=True)`**: retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. This is the safe path for getting a scheduler configured per-request. +* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. + +## How the server works (high-level flow) + +1. **Single model instance** is loaded into memory (GPU/MPS) when the server starts. +2. On each HTTP inference request: + + * The server uses `RequestScopedPipeline.generate(...)` which: + + * obtains a *local scheduler* (via `clone_for_request()` or `deepcopy`), + * does `local_pipe = copy.copy(base_pipe)` (shallow copy), + * sets `local_pipe.scheduler = local_scheduler` (if possible), + * clones only small mutable attributes (callbacks, rng, small latents), + * optionally enters a `model_cpu_offload_context()` for memory offload hooks, + * calls the pipeline on the local view (`local_pipe(...)`). +3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`). +4. Multiple requests can run in parallel while sharing heavy weights and isolating mutable state. + + +## How to set up and run the server + +### 1) Install dependencies + +Recommended: create a virtualenv / conda environment. + +If using the `diffusers` fork via git, either: + +**A) Preinstall the fork first (if you want to avoid hatch direct references):** + +```bash +pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" +pip install -r requirements.txt +``` + +### 2) Start the server + +Using the `server.py` file that already has everything you need: + +```bash +python server.py +``` + +### 3) Example request + +`POST /api/diffusers/inference` with JSON body: + +```json +{ + "prompt": "A futuristic cityscape, vibrant colors", + "num_inference_steps": 30, + "num_images_per_prompt": 1 +} +``` + +Response example: + +```json +{ + "response": ["http://localhost:8500/images/img123.png"] +} +``` + +## Troubleshooting (quick) + +* `Already borrowed` — tokenizers (Rust) error when used concurrently. + + * Workarounds: + + * Acquire a `Lock` around tokenization or around the pipeline call (serializes that part). + * Use the slow tokenizer (`converter_to_slow`) for concurrency tests. + * Patch only the tokenization method to use a lock instead of serializing entire forward. +* `can't set attribute 'components'` — pipeline exposes read-only `components`. + + * The RequestScopedPipeline now detects read-only properties and skips setting them. +* Scheduler issues: + + * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. + + +## Integration notes / performance tips + +* **Compile UNet**: try `pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")` — measure before enabling compile widely. If compile fails, compile only the UNet or use `mode="reduce-overhead"`. +* **Offload**: use `pipeline.enable_model_cpu_offload()` where appropriate to reduce peak GPU memory. 
+* **Quantization**: bitsandbytes quantization reduces memory but may require extra torch.compile flags (e.g., `torch._dynamo.config.capture_dynamic_output_shape_ops = True`). diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt new file mode 100644 index 000000000000..50eeed9b2f9e --- /dev/null +++ b/examples/server-async/requirements.txt @@ -0,0 +1,6 @@ +torch +torchvision +transformers +sentencepiece +fastapi +uvicorn \ No newline at end of file diff --git a/examples/server-async/server.py b/examples/server-async/server.py new file mode 100644 index 000000000000..590522038a53 --- /dev/null +++ b/examples/server-async/server.py @@ -0,0 +1,11 @@ +# DiffusersServerApp already handles the inference server and everything else internally, you +# just need to do these basic configurations and run the script with "python server.py" +# and you already get access to the inference APIs. +from DiffusersServer import DiffusersServerApp + +app = DiffusersServerApp( + model='stabilityai/stable-diffusion-3.5-medium', + type_model='t2im', + threads=3, + enable_memory_monitor=True +) \ No newline at end of file diff --git a/examples/server-async/test.py b/examples/server-async/test.py new file mode 100644 index 000000000000..2a68c77bb28f --- /dev/null +++ b/examples/server-async/test.py @@ -0,0 +1,60 @@ +import os +import time +import urllib.parse +import requests + +SERVER_URL = "http://localhost:8500/api/diffusers/inference" +BASE_URL = "http://localhost:8500" +DOWNLOAD_FOLDER = "imagenes_generadas" +WAIT_BEFORE_DOWNLOAD = 2 # seconds + +os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) + +def save_from_url(url: str) -> str: + """Download the given URL (relative or absolute) and save it locally.""" + if url.startswith("/"): + direct = BASE_URL.rstrip("/") + url + else: + direct = url + resp = requests.get(direct, timeout=60) + resp.raise_for_status() + filename = os.path.basename(urllib.parse.urlparse(direct).path) or f"img_{int(time.time())}.png" + path = os.path.join(DOWNLOAD_FOLDER, filename) + with open(path, "wb") as f: + f.write(resp.content) + return path + +def main(): + payload = { + "prompt": "The T-800 Terminator Robot Returning From The Future, Anime Style", + "num_inference_steps": 30, + "num_images_per_prompt": 1 + } + + print("Sending request...") + try: + r = requests.post(SERVER_URL, json=payload, timeout=480) + r.raise_for_status() + except Exception as e: + print(f"Request failed: {e}") + return + + body = r.json().get("response", []) + # Normalize to a list + urls = body if isinstance(body, list) else [body] if body else [] + if not urls: + print("No URLs found in the response. Check the server output.") + return + + print(f"Received {len(urls)} URL(s). 
Waiting {WAIT_BEFORE_DOWNLOAD}s before downloading...") + time.sleep(WAIT_BEFORE_DOWNLOAD) + + for u in urls: + try: + path = save_from_url(u) + print(f"Image saved to: {path}") + except Exception as e: + print(f"Error downloading {u}: {e}") + +if __name__ == "__main__": + main() From 5c7c7c6077181306b813e82228b74afc2f30ca32 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 7 Sep 2025 12:43:29 -0600 Subject: [PATCH 07/34] Add examples/server-async --- examples/server-async/README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index a13529b7d555..10b4c1825098 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -59,7 +59,7 @@ Recommended: create a virtualenv / conda environment. If using the `diffusers` fork via git, either: -**A) Preinstall the fork first (if you want to avoid hatch direct references):** +**A) Preinstall the fork first:** ```bash pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" @@ -110,9 +110,3 @@ Response example: * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. - -## Integration notes / performance tips - -* **Compile UNet**: try `pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")` — measure before enabling compile widely. If compile fails, compile only the UNet or use `mode="reduce-overhead"`. -* **Offload**: use `pipeline.enable_model_cpu_offload()` where appropriate to reduce peak GPU memory. -* **Quantization**: bitsandbytes quantization reduces memory but may require extra torch.compile flags (e.g., `torch._dynamo.config.capture_dynamic_output_shape_ops = True`). 
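
For reference, a minimal sketch of driving the request-scoped wrapper from this series directly, without the FastAPI server. It assumes the fork from this PR is installed; the model id, dtype, step count, and seeds are illustrative choices rather than part of the patches:

```python
import threading

import torch
from diffusers import StableDiffusion3Pipeline
from diffusers.pipelines.pipeline_utils import RequestScopedPipeline

# Heavy weights are loaded once and shared by every request-scoped view.
base = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.float16
).to("cuda")
request_pipe = RequestScopedPipeline(base)

def handle_request(prompt: str, idx: int) -> None:
    # Each call gets its own shallow pipeline copy and per-request scheduler,
    # so concurrent calls do not race on scheduler.set_timesteps.
    generator = torch.Generator(device="cuda").manual_seed(idx)
    result = request_pipe.generate(
        prompt=prompt,
        generator=generator,
        num_inference_steps=28,
        device="cuda",
    )
    result.images[0].save(f"out_{idx}.png")

threads = [threading.Thread(target=handle_request, args=("a watercolor fox", i)) for i in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

This is the same `RequestScopedPipeline.generate(...)` call that `serverasync.py` runs inside `run_in_threadpool`, so it mirrors the server's concurrency model.
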
From bd3e48a2af68e104840c7137cf755ad687920e68 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 11:50:09 -0600 Subject: [PATCH 08/34] Updated RequestScopedPipeline to handle a single tokenizer lock to avoid race conditions --- src/diffusers/pipelines/pipeline_utils.py | 137 ++++++++++++++++++++-- 1 file changed, 124 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index b05cf71568b1..42f70e6a7330 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -23,6 +23,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin, Iterable import copy +import threading import numpy as np import PIL.Image @@ -180,6 +181,34 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + +class _TokenizerLockWrapper: + def __init__(self, tokenizer: Any, lock: threading.Lock): + self._tokenizer = tokenizer + self._lock = lock + + def __call__(self, *args, **kwargs): + with self._lock: + return self._tokenizer(*args, **kwargs) + + # common tokenizer methods some codepaths call + def encode(self, *args, **kwargs): + with self._lock: + return getattr(self._tokenizer, "encode")(*args, **kwargs) + + def batch_encode_plus(self, *args, **kwargs): + with self._lock: + return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) + + def encode_plus(self, *args, **kwargs): + with self._lock: + return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) + + # fallback: delegate any other attribute access to the original tokenizer + def __getattr__(self, name): + return getattr(self._tokenizer, name) + + class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", @@ -197,6 +226,7 @@ def __init__( mutable_attrs: Optional[Iterable[str]] = None, auto_detect_mutables: bool = True, tensor_numel_threshold: int = 1_000_000, + tokenizer_lock: Optional[threading.Lock] = None ): self._base = pipeline self.unet = getattr(pipeline, "unet", None) @@ -205,6 +235,7 @@ def __init__( self.components = getattr(pipeline, "components", None) self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() self._auto_detect_mutables = bool(auto_detect_mutables) self._tensor_numel_threshold = int(tensor_numel_threshold) @@ -294,7 +325,7 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = {"components",} # añade más si encuentras otros problemáticos + EXCLUDE_ATTRS = {"components",} for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -350,29 +381,109 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = except Exception as e: logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).") local_pipe = copy.deepcopy(self._base) + if local_scheduler is not None: try: setattr(local_pipe, "scheduler", local_scheduler) except Exception: logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") - self._clone_mutable_attrs(self._base, local_pipe) + # 4) wrap tokenizers on the local pipe with the lock wrapper + wrapped_tokenizers = {} # name -> original_tokenizer + try: + # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) 
+ for name in dir(local_pipe): + if "tokenizer" in name and not name.startswith("_"): + try: + tok = getattr(local_pipe, name, None) + if tok is None: + continue + # avoid double-wrapping + if isinstance(tok, _TokenizerLockWrapper): + continue + # perform wrap + originals_tok = tok + try: + setattr(local_pipe, name, _TokenizerLockWrapper(originals_tok, self._tokenizer_lock)) + wrapped_tokenizers[name] = originals_tok + except Exception: + logger.debug(f"Failed to wrap tokenizer attribute '{name}' with lock.") + except Exception: + # ignore attribute access errors + continue + + # b) also check components mapping if present (common pattern) + comps = getattr(local_pipe, "components", None) + if isinstance(comps, dict): + for key, val in list(comps.items()): + # only handle values that look like tokenizers + if key and "tokenizer" in str(key).lower(): + try: + if isinstance(val, _TokenizerLockWrapper): + continue + wrapped_name = f"components[{key}]" + local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) + wrapped_tokenizers[wrapped_name] = val + except Exception: + logger.debug(f"Failed to wrap components['{key}'] tokenizer with lock.") + else: + # sometimes tokenizers are stored as values with names that include 'tokenizer' + try: + if hasattr(val, "__class__") and "tokenizer" in val.__class__.__name__.lower(): + wrapped_name = f"components[{key}]" + if isinstance(val, _TokenizerLockWrapper): + continue + local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) + wrapped_tokenizers[wrapped_name] = val + except Exception: + continue + + except Exception as e: + logger.debug(f"Tokenizer wrapping step encountered an error: {e}") + + # 5) run the pipeline, trying model_cpu_offload_context if available + result = None cm = getattr(local_pipe, "model_cpu_offload_context", None) - if callable(cm): - try: - with cm(): - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except TypeError: - # puede ser que cm sea un context manager ya instanciado en vez de callable + try: + if callable(cm): try: - with cm: - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except Exception as e: - logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") + with cm(): + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except TypeError: + # cm might be a context manager instance rather than callable + try: + with cm: + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except Exception as e: + logger.debug(f"model_cpu_offload_context usage failed: {e}. 
Proceeding without it.") + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + else: + # no offload context available — call directly + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + return result + + finally: + # 6) restore any wrapped tokenizers on local_pipe (best-effort, local_pipe will be GC'd) + try: + # restore direct attrs + for name, orig in list(wrapped_tokenizers.items()): + if name.startswith("components["): + # components entry + key = name[len("components["):-1] + try: + local_pipe.components[key] = orig + except Exception: + pass + else: + try: + setattr(local_pipe, name, orig) + except Exception: + pass + except Exception as e: + logger.debug(f"Error restoring wrapped tokenizers: {e}") class DiffusionPipeline(ConfigMixin, PushToHubMixin): From 534710c854726db50489bfb39846c66819d4c5e0 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 15:49:41 -0600 Subject: [PATCH 09/34] Fix --- src/diffusers/pipelines/pipeline_utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 42f70e6a7330..8eb9057a9fb2 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -191,22 +191,26 @@ def __call__(self, *args, **kwargs): with self._lock: return self._tokenizer(*args, **kwargs) - # common tokenizer methods some codepaths call + def __getattr__(self, name): + return getattr(self._tokenizer, name) + + def __len__(self): + return len(self._tokenizer) + + def __getitem__(self, item): + return self._tokenizer[item] + def encode(self, *args, **kwargs): with self._lock: - return getattr(self._tokenizer, "encode")(*args, **kwargs) + return self._tokenizer.encode(*args, **kwargs) def batch_encode_plus(self, *args, **kwargs): with self._lock: - return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) + return self._tokenizer.batch_encode_plus(*args, **kwargs) def encode_plus(self, *args, **kwargs): with self._lock: - return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) - - # fallback: delegate any other attribute access to the original tokenizer - def __getattr__(self, name): - return getattr(self._tokenizer, name) + return self._tokenizer.encode_plus(*args, **kwargs) class RequestScopedPipeline: From 4d7c64feb65b9d97be016f57730bf5f7d319a15a Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 15:57:57 -0600 Subject: [PATCH 10/34] Fix _TokenizerLockWrapper --- src/diffusers/pipelines/pipeline_utils.py | 102 +++++++++++++++++++--- 1 file changed, 90 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 8eb9057a9fb2..6f9f2249d045 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -187,30 +187,108 @@ def __init__(self, tokenizer: Any, lock: threading.Lock): self._tokenizer = tokenizer self._lock = lock + # --- callables that must be protected by the lock --- def __call__(self, *args, **kwargs): with self._lock: return self._tokenizer(*args, **kwargs) - def __getattr__(self, name): - return getattr(self._tokenizer, name) - - def __len__(self): - return len(self._tokenizer) - - def __getitem__(self, item): - return self._tokenizer[item] - def encode(self, *args, **kwargs): with self._lock: - return 
self._tokenizer.encode(*args, **kwargs) + return getattr(self._tokenizer, "encode")(*args, **kwargs) def batch_encode_plus(self, *args, **kwargs): with self._lock: - return self._tokenizer.batch_encode_plus(*args, **kwargs) + return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) def encode_plus(self, *args, **kwargs): with self._lock: - return self._tokenizer.encode_plus(*args, **kwargs) + return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) + + # --- attribute delegation for everything else --- + def __getattr__(self, name): + # Called only if attribute is not found on this wrapper; + # delegate to the real tokenizer + return getattr(self._tokenizer, name) + + def __repr__(self): + return f"" + + def __str__(self): + return str(self._tokenizer) + + def __len__(self): + try: + return len(self._tokenizer) + except Exception: + return 0 + + def __iter__(self): + return iter(self._tokenizer) + + def __contains__(self, item): + try: + return item in self._tokenizer + except Exception: + return False + + def __getitem__(self, key): + return self._tokenizer[key] + + # --- numeric / comparison support (crucial to fix your TypeError) --- + def _as_int(self) -> int: + """ + Best-effort integer representation for comparisons: + prefer vocab_size, then model_max_length-like attributes, then len(tokenizer), else 0. + """ + for attr in ("vocab_size", "vocab_size_base", "model_max_length", "max_len_single_sentence", "max_len"): + val = getattr(self._tokenizer, attr, None) + if isinstance(val, int): + return val + try: + return int(len(self._tokenizer)) + except Exception: + return 0 + + def __int__(self): + return self._as_int() + + def __index__(self): + return self._as_int() + + # rich comparisons - delegate to integer representation when compared with numbers + def __lt__(self, other): + try: + return self._as_int() < int(other) + except Exception: + return NotImplemented + + def __le__(self, other): + try: + return self._as_int() <= int(other) + except Exception: + return NotImplemented + + def __gt__(self, other): + try: + return self._as_int() > int(other) + except Exception: + return NotImplemented + + def __ge__(self, other): + try: + return self._as_int() >= int(other) + except Exception: + return NotImplemented + + def __eq__(self, other): + # equality: unwrap if other is also wrapper + if isinstance(other, _TokenizerLockWrapper): + return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other._tokenizer) + return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other) + + def __ne__(self, other): + return not self.__eq__(other) + class RequestScopedPipeline: From 18db9e6ff7e9199f9c1f93e5054257cea27884ec Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 16:02:14 -0600 Subject: [PATCH 11/34] Fix _TokenizerLockWrapper --- src/diffusers/pipelines/pipeline_utils.py | 86 +++-------------------- 1 file changed, 10 insertions(+), 76 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6f9f2249d045..6de9fbb14380 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -183,11 +183,10 @@ def __init__(self, *args, **kwargs): class _TokenizerLockWrapper: - def __init__(self, tokenizer: Any, lock: threading.Lock): + def __init__(self, tokenizer, lock): self._tokenizer = tokenizer self._lock = lock - # --- callables that must be protected by the lock --- def __call__(self, *args, **kwargs): with self._lock: return 
self._tokenizer(*args, **kwargs) @@ -204,90 +203,25 @@ def encode_plus(self, *args, **kwargs): with self._lock: return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) - # --- attribute delegation for everything else --- def __getattr__(self, name): - # Called only if attribute is not found on this wrapper; - # delegate to the real tokenizer return getattr(self._tokenizer, name) - def __repr__(self): - return f"" - - def __str__(self): - return str(self._tokenizer) - - def __len__(self): - try: - return len(self._tokenizer) - except Exception: - return 0 - - def __iter__(self): - return iter(self._tokenizer) - - def __contains__(self, item): - try: - return item in self._tokenizer - except Exception: - return False - - def __getitem__(self, key): - return self._tokenizer[key] - - # --- numeric / comparison support (crucial to fix your TypeError) --- - def _as_int(self) -> int: - """ - Best-effort integer representation for comparisons: - prefer vocab_size, then model_max_length-like attributes, then len(tokenizer), else 0. - """ - for attr in ("vocab_size", "vocab_size_base", "model_max_length", "max_len_single_sentence", "max_len"): - val = getattr(self._tokenizer, attr, None) - if isinstance(val, int): - return val - try: - return int(len(self._tokenizer)) - except Exception: - return 0 - def __int__(self): - return self._as_int() - - def __index__(self): - return self._as_int() + return getattr(self._tokenizer, "vocab_size", 0) - # rich comparisons - delegate to integer representation when compared with numbers def __lt__(self, other): - try: - return self._as_int() < int(other) - except Exception: - return NotImplemented - + try: return int(self) < int(other) + except Exception: return NotImplemented def __le__(self, other): - try: - return self._as_int() <= int(other) - except Exception: - return NotImplemented - + try: return int(self) <= int(other) + except Exception: return NotImplemented def __gt__(self, other): - try: - return self._as_int() > int(other) - except Exception: - return NotImplemented - + try: return int(self) > int(other) + except Exception: return NotImplemented def __ge__(self, other): - try: - return self._as_int() >= int(other) - except Exception: - return NotImplemented - - def __eq__(self, other): - # equality: unwrap if other is also wrapper - if isinstance(other, _TokenizerLockWrapper): - return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other._tokenizer) - return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other) + try: return int(self) >= int(other) + except Exception: return NotImplemented - def __ne__(self, other): - return not self.__eq__(other) From 8f0efb1a456ba5686cedaf212bb6520b37770367 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 16:18:09 -0600 Subject: [PATCH 12/34] Delete _TokenizerLockWrapper --- src/diffusers/pipelines/pipeline_utils.py | 120 ++++------------------ 1 file changed, 22 insertions(+), 98 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6de9fbb14380..45339e833c78 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -181,48 +181,9 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - -class _TokenizerLockWrapper: - def __init__(self, tokenizer, lock): - self._tokenizer = tokenizer - self._lock = lock - - def __call__(self, *args, **kwargs): - with self._lock: - return self._tokenizer(*args, **kwargs) - - def encode(self, 
*args, **kwargs): - with self._lock: - return getattr(self._tokenizer, "encode")(*args, **kwargs) - - def batch_encode_plus(self, *args, **kwargs): - with self._lock: - return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) - - def encode_plus(self, *args, **kwargs): - with self._lock: - return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) - - def __getattr__(self, name): - return getattr(self._tokenizer, name) - - def __int__(self): - return getattr(self._tokenizer, "vocab_size", 0) - - def __lt__(self, other): - try: return int(self) < int(other) - except Exception: return NotImplemented - def __le__(self, other): - try: return int(self) <= int(other) - except Exception: return NotImplemented - def __gt__(self, other): - try: return int(self) > int(other) - except Exception: return NotImplemented - def __ge__(self, other): - try: return int(self) >= int(other) - except Exception: return NotImplemented - - +def safe_tokenize(tokenizer, *args, lock, **kwargs): + with lock: + return tokenizer(*args, **kwargs) class RequestScopedPipeline: @@ -407,59 +368,31 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = self._clone_mutable_attrs(self._base, local_pipe) # 4) wrap tokenizers on the local pipe with the lock wrapper - wrapped_tokenizers = {} # name -> original_tokenizer + tokenizer_wrappers = {} # name -> original_tokenizer try: # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) for name in dir(local_pipe): if "tokenizer" in name and not name.startswith("_"): - try: - tok = getattr(local_pipe, name, None) - if tok is None: - continue - # avoid double-wrapping - if isinstance(tok, _TokenizerLockWrapper): - continue - # perform wrap - originals_tok = tok - try: - setattr(local_pipe, name, _TokenizerLockWrapper(originals_tok, self._tokenizer_lock)) - wrapped_tokenizers[name] = originals_tok - except Exception: - logger.debug(f"Failed to wrap tokenizer attribute '{name}' with lock.") - except Exception: - # ignore attribute access errors + tok = getattr(local_pipe, name, None) + if tok is not None: + tokenizer_wrappers[name] = tok + setattr( + local_pipe, + name, + lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + ) + + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): + for key, val in local_pipe.components.items(): + if val is None: continue - - # b) also check components mapping if present (common pattern) - comps = getattr(local_pipe, "components", None) - if isinstance(comps, dict): - for key, val in list(comps.items()): - # only handle values that look like tokenizers - if key and "tokenizer" in str(key).lower(): - try: - if isinstance(val, _TokenizerLockWrapper): - continue - wrapped_name = f"components[{key}]" - local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) - wrapped_tokenizers[wrapped_name] = val - except Exception: - logger.debug(f"Failed to wrap components['{key}'] tokenizer with lock.") - else: - # sometimes tokenizers are stored as values with names that include 'tokenizer' - try: - if hasattr(val, "__class__") and "tokenizer" in val.__class__.__name__.lower(): - wrapped_name = f"components[{key}]" - if isinstance(val, _TokenizerLockWrapper): - continue - local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) - wrapped_tokenizers[wrapped_name] = val - except Exception: - continue + if "tokenizer" in str(key).lower() or "tokenizer" in val.__class__.__name__.lower(): + 
tokenizer_wrappers[f"components[{key}]"] = val + local_pipe.components[key] = lambda *args, tok=val, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) except Exception as e: logger.debug(f"Tokenizer wrapping step encountered an error: {e}") - # 5) run the pipeline, trying model_cpu_offload_context if available result = None cm = getattr(local_pipe, "model_cpu_offload_context", None) try: @@ -482,22 +415,13 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = return result finally: - # 6) restore any wrapped tokenizers on local_pipe (best-effort, local_pipe will be GC'd) try: - # restore direct attrs - for name, orig in list(wrapped_tokenizers.items()): + for name, tok in tokenizer_wrappers.items(): if name.startswith("components["): - # components entry key = name[len("components["):-1] - try: - local_pipe.components[key] = orig - except Exception: - pass + local_pipe.components[key] = tok else: - try: - setattr(local_pipe, name, orig) - except Exception: - pass + setattr(local_pipe, name, tok) except Exception as e: logger.debug(f"Error restoring wrapped tokenizers: {e}") From b47903911e00121f895460dcaa250ba018cca842 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 16:26:18 -0600 Subject: [PATCH 13/34] Fix tokenizer --- src/diffusers/pipelines/pipeline_utils.py | 41 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 45339e833c78..a36f8c803d3d 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -349,6 +349,24 @@ def _clone_mutable_attrs(self, base, local): logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") continue + def _is_tokenizer_component(self, component) -> bool: + """Determina si un componente es un tokenizador basándose en métodos y atributos comunes.""" + if component is None: + return False + + # Verificar métodos comunes de tokenizadores + tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) + + # Verificar nombre de clase + class_name = component.__class__.__name__.lower() + has_tokenizer_in_name = 'tokenizer' in class_name + + # Verificar atributos comunes de tokenizadores + tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) + + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) @@ -374,21 +392,27 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = for name in dir(local_pipe): if "tokenizer" in name and not name.startswith("_"): tok = getattr(local_pipe, name, None) - if tok is not None: + if tok is not None and self._is_tokenizer_component(tok): tokenizer_wrappers[name] = tok setattr( - local_pipe, - name, - lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + local_pipe, + name, + lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) ) + # b) wrap tokenizers in components dict - CORRECCIÓN CRÍTICA if hasattr(local_pipe, "components") and isinstance(local_pipe.components, 
dict): for key, val in local_pipe.components.items(): if val is None: continue - if "tokenizer" in str(key).lower() or "tokenizer" in val.__class__.__name__.lower(): + + # Solo envolver si realmente ES un tokenizador + if self._is_tokenizer_component(val): tokenizer_wrappers[f"components[{key}]"] = val - local_pipe.components[key] = lambda *args, tok=val, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + # Crear una nueva función lambda que capture correctamente 'val' + local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( + tokenizer, *args, lock=self._tokenizer_lock, **kwargs + ) except Exception as e: logger.debug(f"Tokenizer wrapping step encountered an error: {e}") @@ -409,13 +433,14 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) else: - # no offload context available — call directly + # no offload context available — call directly result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) return result finally: try: + # Restaurar los tokenizadores originales for name, tok in tokenizer_wrappers.items(): if name.startswith("components["): key = name[len("components["):-1] @@ -425,7 +450,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = except Exception as e: logger.debug(f"Error restoring wrapped tokenizers: {e}") - + class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" Base class for all pipelines. From 0beab1cf7841723e36b8b982cfb509d60b659b8c Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 20:59:05 -0600 Subject: [PATCH 14/34] Update examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 25 +-- .../DiffusersServer/create_server.py | 2 +- .../DiffusersServer/serverasync.py | 179 ++++++++++++------ .../DiffusersServer/superpipeline.py | 2 - .../DiffusersServer/uvicorn_diffu.py | 26 +-- examples/server-async/requirements.txt | 3 +- 6 files changed, 123 insertions(+), 114 deletions(-) diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 648f708fd562..66391b89560a 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -1,4 +1,4 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/Pipelines.py +# Pipelines.py from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from diffusers.pipelines.flux.pipeline_flux import FluxPipeline @@ -18,22 +18,12 @@ class TextToImageInput(BaseModel): class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusion3Pipeline = None self.device: str = None def start(self): - """ - Inicia el pipeline cargando el modelo en CUDA o MPS según esté disponible. - Se utiliza la ruta del modelo definida en el __init__ y se asigna un valor predeterminado - en función del dispositivo disponible si no se definió previamente. - """ if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. 
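
A side note on the pattern used in the patches above: the `_TokenizerLockWrapper` class is dropped in favour of a plain `safe_tokenize` helper plus small closures that are swapped in for the pipeline's tokenizer attributes. A minimal, self-contained sketch of that idea (illustrative only — `DummyTokenizer` is a stand-in invented for this example and is not part of the diff):

import threading

def safe_tokenize(tokenizer, *args, lock, **kwargs):
    # serialize every tokenizer call across request threads
    with lock:
        return tokenizer(*args, **kwargs)

class DummyTokenizer:
    def __call__(self, text, **kwargs):
        return {"input_ids": list(range(len(text.split())))}

lock = threading.Lock()
tok = DummyTokenizer()

# the same closure shape that RequestScopedPipeline.generate installs on the local pipe:
wrapped = lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=lock, **kwargs)

print(wrapped("a quick smoke test"))  # {'input_ids': [0, 1, 2, 3]}
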
model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" logger.info("Loading CUDA") self.device = "cuda" @@ -42,7 +32,6 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -55,10 +44,6 @@ def start(self): class TextToImagePipelineFlux: def __init__(self, model_path: str | None = None, low_vram: bool = False): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: FluxPipeline = None self.device: str = None @@ -66,7 +51,6 @@ def __init__(self, model_path: str | None = None, low_vram: bool = False): def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading CUDA") self.device = "cuda" @@ -79,7 +63,6 @@ def start(self): else: pass elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -92,17 +75,12 @@ def start(self): class TextToImagePipelineSD: def __init__(self, model_path: str | None = None): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusionPipeline = None self.device: str = None def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading CUDA") self.device = "cuda" @@ -111,7 +89,6 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. 
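
Taken together, the pipeline_utils changes in this series are meant to let one loaded pipeline serve concurrent requests, each with its own scheduler clone and lock-guarded tokenizer calls. A rough usage sketch under those assumptions (the model id, device, and thread count are arbitrary illustrative choices; `RequestScopedPipeline` is imported from the module this series patches and may not be re-exported at the package top level):

import threading
from diffusers import DiffusionPipeline
from diffusers.pipelines.pipeline_utils import RequestScopedPipeline  # class added by this series

base = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")  # assumed model
base.to("cuda")  # or "mps"/"cpu", depending on the host
request_pipe = RequestScopedPipeline(base)

def worker(prompt: str, idx: int):
    # each call gets a per-request scheduler (clone_for_request, or deepcopy as a fallback)
    out = request_pipe.generate(prompt=prompt, num_inference_steps=28)
    out.images[0].save(f"out_{idx}.png")

threads = [threading.Thread(target=worker, args=(f"a photo of a lighthouse, variant {i}", i)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
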
model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading MPS for Mac M Series") self.device = "mps" diff --git a/examples/server-async/DiffusersServer/create_server.py b/examples/server-async/DiffusersServer/create_server.py index a5e6357db9d7..7ccfd9c742f8 100644 --- a/examples/server-async/DiffusersServer/create_server.py +++ b/examples/server-async/DiffusersServer/create_server.py @@ -1,4 +1,4 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/create_server.py +# create_server.py from .Pipelines import * from .serverasync import * diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index 303f1aa31b3f..78e1d44f4119 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -1,6 +1,4 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/serverasync.py - -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, HTTPException, Request from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware from fastapi.concurrency import run_in_threadpool @@ -22,6 +20,8 @@ from typing import Optional, Dict, Any, Type from dataclasses import dataclass, field from typing import List +from contextlib import asynccontextmanager +import asyncio @dataclass class PresetModels: @@ -114,11 +114,11 @@ def save_video(self, video, fps): @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' + model: str = 'stabilityai/stable-diffusion-3-medium' type_models: str = 't2im' custom_model : bool = False constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None components: Optional[Dict[str, Any]] = None api_name: Optional[str] = 'custom_api' torch_dtype: Optional[torch.dtype] = None @@ -126,7 +126,96 @@ class ServerConfigModels: port: int = 8500 def create_app_fastapi(config: ServerConfigModels) -> FastAPI: - app = FastAPI() + + server_config = config or ServerConfigModels() + + @asynccontextmanager + async def lifespan(app: FastAPI): + logging.basicConfig(level=logging.INFO) + app.state.logger = logging.getLogger("diffusers-server") + + app.state.total_requests = 0 + app.state.active_inferences = 0 + app.state.metrics_lock = asyncio.Lock() + app.state.metrics_task = None + + app.state.utils_app = Utils( + host=server_config.host, + port=server_config.port, + ) + + async def metrics_loop(): + try: + while True: + async with app.state.metrics_lock: + total = app.state.total_requests + active = app.state.active_inferences + app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}") + await asyncio.sleep(5) + except asyncio.CancelledError: + app.state.logger.info("Metrics loop cancelled") + raise + + app.state.metrics_task = asyncio.create_task(metrics_loop()) + + try: + yield + finally: + # 🔻 shutdown + task = app.state.metrics_task + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + try: + stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) + if callable(stop_fn): + await run_in_threadpool(stop_fn) + except Exception as e: + app.state.logger.warning(f"Error during pipeline shutdown: {e}") + + app.state.logger.info("Lifespan shutdown complete") + + + + app = FastAPI(lifespan=lifespan) + + logger = 
logging.getLogger("DiffusersServer.Pipelines") + + if server_config.custom_model: + if server_config.constructor_pipeline is None: + raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required") + + initializer = server_config.constructor_pipeline( + model_path=server_config.model, + pipeline=server_config.custom_pipeline, + torch_dtype=server_config.torch_dtype, + components=server_config.components, + ) + model_pipeline = initializer.start() + request_pipe = None + pipeline_lock = threading.Lock() + + else: + initializer = ModelPipelineInitializer( + model=server_config.model, + type_models=server_config.type_models, + ) + model_pipeline = initializer.initialize_pipeline() + model_pipeline.start() + + request_pipe = RequestScopedPipeline(model_pipeline.pipeline) + pipeline_lock = threading.Lock() + + logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") + + app.state.MODEL_INITIALIZER = initializer + app.state.MODEL_PIPELINE = model_pipeline + app.state.REQUEST_PIPE = request_pipe + app.state.PIPELINE_LOCK = pipeline_lock class JSONBodyQueryAPI(BaseModel): model : str | None = None @@ -135,54 +224,12 @@ class JSONBodyQueryAPI(BaseModel): num_inference_steps : int = 28 num_images_per_prompt : int = 1 - logging.basicConfig(level=logging.INFO) - global logger - logger = logging.getLogger(__name__) - - server_config = config or ServerConfigModels() - app.state.SERVER_CONFIG = server_config - - global utils_app - - utils_app = Utils(host=server_config.host, port=server_config.port) - - logger.info(f"Inicializando pipeline para el modelo: {server_config.model}") - try: - if server_config.custom_model: - if server_config.constructor_pipeline is None: - raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required") - initializer = server_config.constructor_pipeline( - model_path=server_config.model, - pipeline=server_config.custom_pipeline, - torch_dtype=server_config.torch_dtype, - components=server_config.components, - ) - model_pipeline = initializer.start() - app.state.CUSTOM_PIPELINE = server_config.custom_pipeline - app.state.MODEL_PIPELINE = model_pipeline - app.state.MODEL_INITIALIZER = initializer - logger.info(f"Pipeline personalizado inicializado. 
Tipo: {type(model_pipeline)}") - else: - initializer = ModelPipelineInitializer( - model=server_config.model, - type_models=server_config.type_models, - ) - model_pipeline = initializer.initialize_pipeline() - model_pipeline.start() - - app.state.REQUEST_PIPE = RequestScopedPipeline(model_pipeline.pipeline) - - # Lock for concurrency - pipeline_lock = threading.Lock() - - app.state.MODEL_PIPELINE = model_pipeline - app.state.PIPELINE_LOCK = pipeline_lock - app.state.MODEL_INITIALIZER = initializer - - logger.info("Pipeline initialized and ready to receive requests") - except Exception as e: - logger.error(f"Error initializing pipeline: {e}") - raise + @app.middleware("http") + async def count_requests_middleware(request: Request, call_next): + async with app.state.metrics_lock: + app.state.total_requests += 1 + response = await call_next(request) + return response @app.get("/") @@ -196,14 +243,16 @@ async def api(json: JSONBodyQueryAPI): num_steps = json.num_inference_steps num_images_per_prompt = json.num_images_per_prompt - wrapper = app.state.MODEL_PIPELINE + wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER + utils_app = app.state.utils_app + if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Modelo no inicializado correctamente") + raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): - raise HTTPException(400, "No se proporcionó prompt") + raise HTTPException(400, "No prompt provided") def make_generator(): g = torch.Generator(device=initializer.device) @@ -212,9 +261,6 @@ def make_generator(): req_pipe = app.state.REQUEST_PIPE def infer(): - # This is called that because the RequestScoped Pipeline already internally - # handles everything necessary for inference and only the - # model pipeline needs to be passed, for example StableDiffusion3Pipeline gen = make_generator() return req_pipe.generate( prompt=prompt, @@ -226,14 +272,22 @@ def infer(): ) try: + async with app.state.metrics_lock: + app.state.active_inferences += 1 + output = await run_in_threadpool(infer) + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + urls = [utils_app.save_image(img) for img in output.images] return {"response": urls} except Exception as e: - logger.error(f"Error durante la inferencia: {e}") - raise HTTPException(500, f"Error en procesamiento: {e}") + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + logger.error(f"Error during inference: {e}") + raise HTTPException(500, f"Error in processing: {e}") finally: import gc; gc.collect() @@ -243,6 +297,7 @@ def infer(): @app.get("/images/{filename}") async def serve_image(filename: str): + utils_app = app.state.utils_app file_path = os.path.join(utils_app.image_dir, filename) if not os.path.isfile(file_path): raise HTTPException(status_code=404, detail="Image not found") diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py index 394ebac39011..8f5064c1f04a 100644 --- a/examples/server-async/DiffusersServer/superpipeline.py +++ b/examples/server-async/DiffusersServer/superpipeline.py @@ -1,5 +1,3 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/superpipeline.py - from diffusers.pipelines import * from diffusers import * import torch diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py index 
7e19b50f3cbe..437e4961f4d4 100644 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -1,5 +1,3 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/uvicorn_diffu.py - import uvicorn import logging import gc @@ -7,7 +5,6 @@ import os import threading import time -import string def setup_logging(): logging.basicConfig(level=logging.INFO) @@ -18,10 +15,8 @@ def setup_logging(): def memory_cleanup(interval=30): while True: try: - gc.collect() - process = psutil.Process(os.getpid()) mem = process.memory_info().rss / 1024 / 1024 logger.info(f"Memoria en uso: {mem:.2f} MB") @@ -43,23 +38,6 @@ def run_uvicorn_server( ], enable_memory_monitor=True ): - """ - Ejecuta un servidor de FastAPI utilizando Uvicorn con monitoreo de memoria opcional - - Args: - app: Aplicación FastAPI - host (str): Host donde se servirá la aplicación - port (int): Puerto para el servidor - workers (int): Número de hilos para Uvicorn - cleanup_interval (int): Intervalo de limpieza para Uvicorn - channel_timeout (int): Tiempo de espera máximo para canales - server_header (bool): Activar el identificador / Header del servidor - headers (str): Identificador del servidor / Header del servidor - enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria - - Returns: - El resultado de serve() (aunque normalmente no retorna) - """ gc.enable() gc.set_threshold(700, 10, 5) @@ -70,9 +48,9 @@ def run_uvicorn_server( daemon=True ) cleanup_thread.start() - logger.info("Monitor de memoria activado") + logger.info("Memory monitor activated") - logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") + logger.info(f"Starting Uvicorn server in {host}:{port}...") config = uvicorn.Config( app=app, diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt index 50eeed9b2f9e..b7a30ef45da8 100644 --- a/examples/server-async/requirements.txt +++ b/examples/server-async/requirements.txt @@ -3,4 +3,5 @@ torchvision transformers sentencepiece fastapi -uvicorn \ No newline at end of file +uvicorn +fifty \ No newline at end of file From 840f0e4a7ab1d1819a6a60df200e48dd91b87d6f Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Thu, 11 Sep 2025 15:20:57 -0600 Subject: [PATCH 15/34] Fix server-async --- .../server-async/DiffusersServer/Pipelines.py | 116 +++++++++++- .../DiffusersServer/serverasync.py | 171 ++++++++++++++---- .../DiffusersServer/superpipeline.py | 8 + .../DiffusersServer/uvicorn_diffu.py | 32 +++- examples/server-async/requirements.txt | 2 +- 5 files changed, 288 insertions(+), 41 deletions(-) diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 66391b89560a..087c4cbd380a 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -7,6 +7,7 @@ import os import logging from pydantic import BaseModel +import gc logger = logging.getLogger(__name__) @@ -19,31 +20,126 @@ class TextToImageInput(BaseModel): class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: StableDiffusion3Pipeline = None - self.device: str = None - + self.pipeline: StableDiffusion3Pipeline | None = None + self.device: str | None = None + def start(self): + torch.set_float32_matmul_precision("high") + + if hasattr(torch._inductor, 'config'): + if hasattr(torch._inductor.config, 'conv_1x1_as_mm'): + 
torch._inductor.config.conv_1x1_as_mm = True + if hasattr(torch._inductor.config, 'coordinate_descent_tuning'): + torch._inductor.config.coordinate_descent_tuning = True + if hasattr(torch._inductor.config, 'epilogue_fusion'): + torch._inductor.config.epilogue_fusion = False + if hasattr(torch._inductor.config, 'coordinate_descent_check_all_directions'): + torch._inductor.config.coordinate_descent_check_all_directions = True + + if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.allow_tf32 = True + + if torch.cuda.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" - logger.info("Loading CUDA") + logger.info(f"Loading CUDA with model: {model_path}") self.device = "cuda" + + torch.cuda.empty_cache() + gc.collect() + self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.float16, - ).to(device=self.device) + use_safetensors=True, + variant="fp16" if "fp16" in model_path else None, + low_cpu_mem_usage=True, + ) + + self.pipeline = self.pipeline.to(device=self.device) + + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: + self.pipeline.transformer = self.pipeline.transformer.to( + memory_format=torch.channels_last + ) + logger.info("Transformer optimized with channels_last format") + + if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: + self.pipeline.vae = self.pipeline.vae.to( + memory_format=torch.channels_last + ) + logger.info("VAE optimized with channels_last format") + + try: + self.pipeline.enable_xformers_memory_efficient_attention() + logger.info("XFormers memory efficient attention enabled") + except Exception as e: + logger.info(f"XFormers not available: {e}") + + # --- Se descarta torch.compile pero se mantiene el resto --- + if torch.__version__ >= "2.0.0": + logger.info("Skipping torch.compile - running without compile optimizations by design") + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + logger.info("CUDA pipeline fully optimized and ready") + elif torch.backends.mps.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" - logger.info("Loading MPS for Mac M Series") + logger.info(f"Loading MPS for Mac M Series with model: {model_path}") self.device = "mps" self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.bfloat16, + use_safetensors=True, + low_cpu_mem_usage=True, ).to(device=self.device) + + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: + self.pipeline.transformer = self.pipeline.transformer.to( + memory_format=torch.channels_last + ) + + if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: + self.pipeline.vae = self.pipeline.vae.to( + memory_format=torch.channels_last + ) + + + logger.info("MPS pipeline optimized and ready") + else: raise Exception("No CUDA or MPS device available") + + # OPTIONAL WARMUP + self._warmup() + + logger.info("Pipeline initialization completed successfully") + + def _warmup(self): + if self.pipeline: + logger.info("Running warmup inference...") + with torch.no_grad(): + _ = self.pipeline( + prompt="warmup", + num_inference_steps=1, + height=512, + width=512, + guidance_scale=1.0, + ) + torch.cuda.empty_cache() if self.device == "cuda" else None + logger.info("Warmup completed") class TextToImagePipelineFlux: def __init__(self, model_path: str | None = 
None, low_vram: bool = False): + """ + Inicialización de la clase con la ruta del modelo. + Si no se proporciona, se obtiene de la variable de entorno. + """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: FluxPipeline = None self.device: str = None @@ -51,6 +147,7 @@ def __init__(self, model_path: str | None = None, low_vram: bool = False): def start(self): if torch.cuda.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading CUDA") self.device = "cuda" @@ -63,6 +160,7 @@ def start(self): else: pass elif torch.backends.mps.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -75,12 +173,17 @@ def start(self): class TextToImagePipelineSD: def __init__(self, model_path: str | None = None): + """ + Inicialización de la clase con la ruta del modelo. + Si no se proporciona, se obtiene de la variable de entorno. + """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusionPipeline = None self.device: str = None def start(self): if torch.cuda.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading CUDA") self.device = "cuda" @@ -89,6 +192,7 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading MPS for Mac M Series") self.device = "mps" diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index 78e1d44f4119..e7e056786c5d 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -1,3 +1,5 @@ +# Voy a mudar todo el servidor a un servidor asincrono con FastAPI y Uvicorn +# Mientras complete esto, el servidor actual sigue funcionando from fastapi import FastAPI, HTTPException, Request from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware @@ -22,6 +24,17 @@ from typing import List from contextlib import asynccontextmanager import asyncio +from PIL import Image + +""" +The goal is to create image generation, editing, and variance endpoints compatible with the OpenAI client. 
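
To make that goal concrete, a hypothetical client call against the JSON generation endpoint this file defines — the route path and host below are assumptions (substitute whatever route `serverasync.py` actually registers for `api()`); the request fields mirror the `JSONBodyQueryAPI` model:

import requests

payload = {
    "prompt": "a watercolor painting of a lighthouse",
    "negative_prompt": "",
    "num_inference_steps": 28,       # default in JSONBodyQueryAPI
    "num_images_per_prompt": 1,
}

# assumed address: ServerConfigModels defaults to host='0.0.0.0', port=8500
resp = requests.post("http://localhost:8500/api/inference", json=payload, timeout=600)  # placeholder route
resp.raise_for_status()
print(resp.json()["response"])       # list of URLs served back via GET /images/{filename}
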
+ +APIs: + +POST /images/variations (create_variation) +POST /images/edits (edit) +POST /images/generations (generate) +""" @dataclass class PresetModels: @@ -80,30 +93,96 @@ def __init__(self, host: str = '0.0.0.0', port: int = 8500): if not os.path.exists(self.video_dir): os.makedirs(self.video_dir) - def save_image(self, image): - if hasattr(image, "to"): - try: - image = image.to("cpu") - except Exception: - pass + def _tensor_to_pil_minimal(self, tensor: torch.Tensor) -> Image.Image: + """ + Convertir tensor GPU->PIL minimizando copias: + - sincroniza GPU + - mueve a CPU non_blocking (requiere pinned memory para ser efectivo) + - hace contiguous una sola vez + - convierte a uint8 una sola vez + """ + # Acepta [N,C,H,W] o [C,H,W] + t = tensor + if t.ndim == 4: + t = t[0] + + # Asegurar que GPU terminó + if t.is_cuda: + torch.cuda.synchronize() + + # Mover a CPU (non_blocking where possible) y hacer contiguous + cpu_t = t.detach().to("cpu", non_blocking=True).contiguous() + + # Normalizar y convertir a uint8. Asumo rango [0,1]. Si tu pipeline devuelve [-1,1] + # usar: cpu_t = (cpu_t + 1) / 2 + cpu_t = cpu_t.clamp(0, 1).mul(255).to(torch.uint8) - if isinstance(image, torch.Tensor): - from torchvision import transforms - to_pil = transforms.ToPILImage() - image = to_pil(image.squeeze(0).clamp(0, 1)) + # reordenar a H,W,C y extraer numpy (una copia inevitable) + arr = cpu_t.permute(1, 2, 0).numpy() + pil = Image.fromarray(arr) + + # cleanup variables intermedias (liberar memoria lo antes posible) + try: + del arr, cpu_t, t + except Exception: + pass + + return pil + + def save_image(self, image): filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" image_path = os.path.join(self.image_dir, filename) logger.info(f"Saving image to {image_path}") - image.save(image_path, format="PNG", optimize=True) + try: + # Si ya es PIL, guardar directo + if isinstance(image, Image.Image): + image.save(image_path, format="PNG", optimize=True) + # liberar referencia + del image + else: + # Si tiene método to (posible tensor o wrapper), intentar mover a cpu primero (seguro) + if hasattr(image, "to") and isinstance(image, torch.Tensor): + # Convertir tensor -> PIL minimizando copias + pil = self._tensor_to_pil_minimal(image) + # Guardar con lock/serialización (see usage in endpoint) + pil.save(image_path, format="PNG", optimize=True) + del pil + else: + # Fallback: si no es tensor ni PIL, intenta convertir via torchvision + try: + from torchvision import transforms + to_pil = transforms.ToPILImage() + pil = to_pil(image.squeeze(0).clamp(0, 1)) + pil.save(image_path, format="PNG", optimize=True) + del pil + except Exception as e: + raise RuntimeError(f"Unsupported image object for saving: {e}") + + # cleanup agresivo + gc.collect() + if torch.cuda.is_available(): + # sincronizar y limpiar caches GPU para evitar buffers retenidos + try: + torch.cuda.synchronize() + except Exception: + pass + torch.cuda.empty_cache() - del image - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() + return os.path.join(self.service_url, "images", filename) - return os.path.join(self.service_url, "images", filename) + except Exception as e: + # intentar limpiar en caso de error + try: + del image + except Exception: + pass + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + logger.error(f"Error saving image: {e}") + raise def save_video(self, video, fps): filename = "video" + str(uuid.uuid4()).split("-")[0] + ".mp4" @@ -114,11 +193,11 @@ def save_video(self, video, fps): 
@dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' - type_models: str = 't2im' + model: str = 'stabilityai/stable-diffusion-3-medium' # Valor predeterminado + type_models: str = 't2im' # Solo hay t2im y t2v custom_model : bool = False constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None # Añadimos valor por defecto components: Optional[Dict[str, Any]] = None api_name: Optional[str] = 'custom_api' torch_dtype: Optional[torch.dtype] = None @@ -139,6 +218,9 @@ async def lifespan(app: FastAPI): app.state.metrics_lock = asyncio.Lock() app.state.metrics_task = None + # Guardar modelo ya inicializado + + # Inicializar utils app.state.utils_app = Utils( host=server_config.host, port=server_config.port, @@ -157,6 +239,9 @@ async def metrics_loop(): raise app.state.metrics_task = asyncio.create_task(metrics_loop()) + from concurrent.futures import ThreadPoolExecutor + + app.state.SAVE_EXECUTOR = ThreadPoolExecutor(max_workers=1) try: yield @@ -170,6 +255,7 @@ async def metrics_loop(): except asyncio.CancelledError: pass + # Intentar liberar pipeline si tiene stop/close try: stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) if callable(stop_fn): @@ -210,7 +296,7 @@ async def metrics_loop(): request_pipe = RequestScopedPipeline(model_pipeline.pipeline) pipeline_lock = threading.Lock() - logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") + logger.info(f"Pipeline inicializado y listo para recibir solicitudes (modelo={server_config.model})") app.state.MODEL_INITIALIZER = initializer app.state.MODEL_PIPELINE = model_pipeline @@ -245,21 +331,18 @@ async def api(json: JSONBodyQueryAPI): wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER - - utils_app = app.state.utils_app - + utils_app = app.state.utils_app + req_pipe = app.state.REQUEST_PIPE if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Model not initialized correctly") + raise HTTPException(500, "Modelo no inicializado correctamente") if not prompt.strip(): - raise HTTPException(400, "No prompt provided") + raise HTTPException(400, "No se proporcionó prompt") def make_generator(): g = torch.Generator(device=initializer.device) return g.manual_seed(random.randint(0, 10_000_000)) - req_pipe = app.state.REQUEST_PIPE - def infer(): gen = make_generator() return req_pipe.generate( @@ -277,20 +360,44 @@ def infer(): output = await run_in_threadpool(infer) + saved_urls = [] + loop = asyncio.get_running_loop() + + images = getattr(output, "images", []) or [] + for idx, img in enumerate(images): + try: + url = await loop.run_in_executor(app.state.SAVE_EXECUTOR, utils_app.save_image, img) + saved_urls.append(url) + except Exception as e: + logger.error(f"Error guardando imagen {idx}: {e}") + finally: + try: + del img + except Exception: + pass + import gc + gc.collect() + if torch.cuda.is_available(): + try: + torch.cuda.synchronize() + except Exception: + pass + torch.cuda.empty_cache() + async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - urls = [utils_app.save_image(img) for img in output.images] - return {"response": urls} + return {"response": saved_urls} except Exception as e: async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - logger.error(f"Error during inference: {e}") - raise HTTPException(500, f"Error 
in processing: {e}") + logger.error(f"Error durante la inferencia: {e}") + raise HTTPException(500, f"Error en procesamiento: {e}") finally: - import gc; gc.collect() + import gc + gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py index 8f5064c1f04a..4e2bb9452c4a 100644 --- a/examples/server-async/DiffusersServer/superpipeline.py +++ b/examples/server-async/DiffusersServer/superpipeline.py @@ -11,6 +11,14 @@ def __init__(self, model_path: str, pipeline: Type, torch_dtype = torch.bfloat16, components: Optional[Dict[str, Any]] = None,): + """ + Clase para crear tus Pipelines personalizados para tu API custom + Args: + model_path: Ruta o nombre del modelo + pipeline: Clase del pipeline a utilizar + torch_dtype: Tipo de datos de PyTorch a utilizar + components: Diccionario de componentes personalizados + """ self.model_path = model_path self.pipeline = pipeline self.torch_dtype = torch_dtype diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py index 437e4961f4d4..faefc5c2f0ee 100644 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -5,7 +5,9 @@ import os import threading import time +import string +# Configuración de logging def setup_logging(): logging.basicConfig(level=logging.INFO) return logging.getLogger('uvicorn') @@ -13,10 +15,18 @@ def setup_logging(): logger = setup_logging() def memory_cleanup(interval=30): + """ + Función para monitorear y limpiar la memoria periódicamente + + Args: + interval (int): Intervalo en segundos entre limpiezas + """ while True: try: + # Forzar recolección de basura gc.collect() + # Obtener información de memoria actual process = psutil.Process(os.getpid()) mem = process.memory_info().rss / 1024 / 1024 logger.info(f"Memoria en uso: {mem:.2f} MB") @@ -38,9 +48,27 @@ def run_uvicorn_server( ], enable_memory_monitor=True ): + """ + Ejecuta un servidor de FastAPI utilizando Uvicorn con monitoreo de memoria opcional + + Args: + app: Aplicación FastAPI + host (str): Host donde se servirá la aplicación + port (int): Puerto para el servidor + workers (int): Número de hilos para Uvicorn + cleanup_interval (int): Intervalo de limpieza para Uvicorn + channel_timeout (int): Tiempo de espera máximo para canales + server_header (bool): Activar el identificador / Header del servidor + headers (str): Identificador del servidor / Header del servidor + enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria + + Returns: + El resultado de serve() (aunque normalmente no retorna) + """ gc.enable() gc.set_threshold(700, 10, 5) + # Iniciar monitoreo de memoria si está habilitado if enable_memory_monitor: cleanup_thread = threading.Thread( target=memory_cleanup, @@ -48,9 +76,9 @@ def run_uvicorn_server( daemon=True ) cleanup_thread.start() - logger.info("Memory monitor activated") + logger.info("Monitor de memoria activado") - logger.info(f"Starting Uvicorn server in {host}:{port}...") + logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") config = uvicorn.Config( app=app, diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt index b7a30ef45da8..d5a3746c235b 100644 --- a/examples/server-async/requirements.txt +++ b/examples/server-async/requirements.txt @@ -4,4 +4,4 @@ transformers sentencepiece fastapi uvicorn -fifty \ No newline 
at end of file +ftfy \ No newline at end of file From ed617fe154e2adf996be77dd6eef86f0d74e4a02 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Fri, 12 Sep 2025 21:07:19 -0600 Subject: [PATCH 16/34] Optimizations in examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 41 ++- .../DiffusersServer/serverasync.py | 290 +++++++++--------- examples/server-async/requirements.txt | 5 +- 3 files changed, 178 insertions(+), 158 deletions(-) diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 087c4cbd380a..60be11b2f241 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -42,7 +42,6 @@ def start(self): torch.backends.cudnn.deterministic = False torch.backends.cudnn.allow_tf32 = True - if torch.cuda.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" logger.info(f"Loading CUDA with model: {model_path}") @@ -61,6 +60,14 @@ def start(self): self.pipeline = self.pipeline.to(device=self.device) + if hasattr(self.pipeline, 'enable_vae_slicing'): + self.pipeline.enable_vae_slicing() + logger.info("VAE slicing enabled - will reduce memory spikes during decoding") + + if hasattr(self.pipeline, 'enable_vae_tiling'): + self.pipeline.enable_vae_tiling() + logger.info("VAE tiling enabled - will allow processing larger images") + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: self.pipeline.transformer = self.pipeline.transformer.to( memory_format=torch.channels_last @@ -71,6 +78,15 @@ def start(self): self.pipeline.vae = self.pipeline.vae.to( memory_format=torch.channels_last ) + + if hasattr(self.pipeline.vae, 'enable_slicing'): + self.pipeline.vae.enable_slicing() + logger.info("VAE slicing activated directly in the VAE") + + if hasattr(self.pipeline.vae, 'enable_tiling'): + self.pipeline.vae.enable_tiling() + logger.info("VAE tiling activated directly on the VAE") + logger.info("VAE optimized with channels_last format") try: @@ -79,9 +95,7 @@ def start(self): except Exception as e: logger.info(f"XFormers not available: {e}") - # --- Se descarta torch.compile pero se mantiene el resto --- - if torch.__version__ >= "2.0.0": - logger.info("Skipping torch.compile - running without compile optimizations by design") + logger.info("Skipping torch.compile - running without compile optimizations by design") if torch.cuda.is_available(): torch.cuda.empty_cache() @@ -92,13 +106,18 @@ def start(self): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" logger.info(f"Loading MPS for Mac M Series with model: {model_path}") self.device = "mps" + self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.bfloat16, use_safetensors=True, low_cpu_mem_usage=True, ).to(device=self.device) - + + if hasattr(self.pipeline, 'enable_vae_slicing'): + self.pipeline.enable_vae_slicing() + logger.info("VAE slicing enabled in MPS") + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: self.pipeline.transformer = self.pipeline.transformer.to( memory_format=torch.channels_last @@ -108,14 +127,13 @@ def start(self): self.pipeline.vae = self.pipeline.vae.to( memory_format=torch.channels_last ) - logger.info("MPS pipeline optimized and ready") else: raise Exception("No CUDA or MPS device available") - # OPTIONAL WARMUP + self._warmup() logger.info("Pipeline initialization completed successfully") @@ -131,8 +149,13 @@ def _warmup(self): 
width=512, guidance_scale=1.0, ) - torch.cuda.empty_cache() if self.device == "cuda" else None - logger.info("Warmup completed") + + if self.device == "cuda": + torch.cuda.synchronize() + torch.cuda.empty_cache() + + gc.collect() + logger.info("Warmup completed with memory cleanup") class TextToImagePipelineFlux: def __init__(self, model_path: str | None = None, low_vram: bool = False): diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index e7e056786c5d..a3392500d9f6 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -1,5 +1,3 @@ -# Voy a mudar todo el servidor a un servidor asincrono con FastAPI y Uvicorn -# Mientras complete esto, el servidor actual sigue funcionando from fastapi import FastAPI, HTTPException, Request from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware @@ -26,16 +24,6 @@ import asyncio from PIL import Image -""" -The goal is to create image generation, editing, and variance endpoints compatible with the OpenAI client. - -APIs: - -POST /images/variations (create_variation) -POST /images/edits (edit) -POST /images/generations (generate) -""" - @dataclass class PresetModels: SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) @@ -93,111 +81,114 @@ def __init__(self, host: str = '0.0.0.0', port: int = 8500): if not os.path.exists(self.video_dir): os.makedirs(self.video_dir) - def _tensor_to_pil_minimal(self, tensor: torch.Tensor) -> Image.Image: - """ - Convertir tensor GPU->PIL minimizando copias: - - sincroniza GPU - - mueve a CPU non_blocking (requiere pinned memory para ser efectivo) - - hace contiguous una sola vez - - convierte a uint8 una sola vez - """ - # Acepta [N,C,H,W] o [C,H,W] - t = tensor - if t.ndim == 4: - t = t[0] - - # Asegurar que GPU terminó - if t.is_cuda: - torch.cuda.synchronize() - - # Mover a CPU (non_blocking where possible) y hacer contiguous - cpu_t = t.detach().to("cpu", non_blocking=True).contiguous() - - # Normalizar y convertir a uint8. Asumo rango [0,1]. 
Si tu pipeline devuelve [-1,1] - # usar: cpu_t = (cpu_t + 1) / 2 - cpu_t = cpu_t.clamp(0, 1).mul(255).to(torch.uint8) - - # reordenar a H,W,C y extraer numpy (una copia inevitable) - arr = cpu_t.permute(1, 2, 0).numpy() - - pil = Image.fromarray(arr) + from concurrent.futures import ThreadPoolExecutor + self.executor = ThreadPoolExecutor(max_workers=2) - # cleanup variables intermedias (liberar memoria lo antes posible) + def _save_pil_image(self, pil_image: Image.Image, filepath: str): try: - del arr, cpu_t, t - except Exception: - pass - - return pil - - def save_image(self, image): - filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" - image_path = os.path.join(self.image_dir, filename) - logger.info(f"Saving image to {image_path}") - + pil_image.save(filepath, format="PNG", optimize=True, compress_level=6) + except Exception as e: + logger.error(f"Error saving PIL image: {e}") + raise + finally: + if pil_image: + pil_image.close() + del pil_image + + def _tensor_to_pil_optimized(self, tensor: torch.Tensor) -> Image.Image: + + with torch.no_grad(): + tensor_cpu = tensor.detach().clone() + + if tensor_cpu.is_cuda: + tensor_cpu = tensor_cpu.cpu() + torch.cuda.synchronize() + + if tensor_cpu.dim() == 4: + tensor_cpu = tensor_cpu[0] + + tensor_cpu = tensor_cpu.clamp(0, 1).mul(255).byte() + + if tensor_cpu.shape[0] in [1, 3, 4]: + tensor_cpu = tensor_cpu.permute(1, 2, 0) + + np_array = tensor_cpu.contiguous().numpy() + + del tensor_cpu + + if np_array.shape[-1] == 1: + np_array = np_array.squeeze(-1) + mode = 'L' + elif np_array.shape[-1] == 3: + mode = 'RGB' + elif np_array.shape[-1] == 4: + mode = 'RGBA' + else: + raise ValueError(f"Unsupported number of channels: {np_array.shape[-1]}") + + pil_image = Image.fromarray(np_array, mode=mode) + + del np_array + + return pil_image + + async def save_image(self, image) -> str: + + image_id = str(uuid.uuid4()).split("-")[0] + filename = f"img{image_id}.png" + filepath = os.path.join(self.image_dir, filename) + url = os.path.join(self.service_url, "images", filename) + + loop = asyncio.get_event_loop() + try: - # Si ya es PIL, guardar directo if isinstance(image, Image.Image): - image.save(image_path, format="PNG", optimize=True) - # liberar referencia - del image + await loop.run_in_executor( + self.executor, + self._save_pil_image, + image, + filepath + ) + + elif isinstance(image, torch.Tensor): + with torch.no_grad(): + pil_image = await loop.run_in_executor( + None, + self._tensor_to_pil_optimized, + image + ) + + await loop.run_in_executor( + self.executor, + self._save_pil_image, + pil_image, + filepath + ) + + del pil_image + else: - # Si tiene método to (posible tensor o wrapper), intentar mover a cpu primero (seguro) - if hasattr(image, "to") and isinstance(image, torch.Tensor): - # Convertir tensor -> PIL minimizando copias - pil = self._tensor_to_pil_minimal(image) - # Guardar con lock/serialización (see usage in endpoint) - pil.save(image_path, format="PNG", optimize=True) - del pil - else: - # Fallback: si no es tensor ni PIL, intenta convertir via torchvision - try: - from torchvision import transforms - to_pil = transforms.ToPILImage() - pil = to_pil(image.squeeze(0).clamp(0, 1)) - pil.save(image_path, format="PNG", optimize=True) - del pil - except Exception as e: - raise RuntimeError(f"Unsupported image object for saving: {e}") - - # cleanup agresivo - gc.collect() - if torch.cuda.is_available(): - # sincronizar y limpiar caches GPU para evitar buffers retenidos - try: - torch.cuda.synchronize() - except Exception: - 
pass - torch.cuda.empty_cache() - - return os.path.join(self.service_url, "images", filename) - + raise ValueError(f"Unsupported image type: {type(image)}") + + logger.debug(f"Image saved: {filename}") + return url + except Exception as e: - # intentar limpiar en caso de error - try: - del image - except Exception: - pass - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - logger.error(f"Error saving image: {e}") + logger.error(f"Error in save_image_optimized: {e}") raise - - def save_video(self, video, fps): - filename = "video" + str(uuid.uuid4()).split("-")[0] + ".mp4" - video_path = os.path.join(self.video_dir, filename) - export = export_to_video(video, video_path, fps=fps) - logger.info(f"Saving video to {video_path}") - return os.path.join(self.service_url, "video", filename) + finally: + gc.collect() + + def shutdown(self): + self.executor.shutdown(wait=True) @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' # Valor predeterminado - type_models: str = 't2im' # Solo hay t2im y t2v + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' custom_model : bool = False constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None # Añadimos valor por defecto + custom_pipeline: Optional[Type] = None components: Optional[Dict[str, Any]] = None api_name: Optional[str] = 'custom_api' torch_dtype: Optional[torch.dtype] = None @@ -218,9 +209,6 @@ async def lifespan(app: FastAPI): app.state.metrics_lock = asyncio.Lock() app.state.metrics_task = None - # Guardar modelo ya inicializado - - # Inicializar utils app.state.utils_app = Utils( host=server_config.host, port=server_config.port, @@ -240,13 +228,11 @@ async def metrics_loop(): app.state.metrics_task = asyncio.create_task(metrics_loop()) from concurrent.futures import ThreadPoolExecutor - app.state.SAVE_EXECUTOR = ThreadPoolExecutor(max_workers=1) try: yield finally: - # 🔻 shutdown task = app.state.metrics_task if task: task.cancel() @@ -255,7 +241,6 @@ async def metrics_loop(): except asyncio.CancelledError: pass - # Intentar liberar pipeline si tiene stop/close try: stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) if callable(stop_fn): @@ -265,8 +250,6 @@ async def metrics_loop(): app.state.logger.info("Lifespan shutdown complete") - - app = FastAPI(lifespan=lifespan) logger = logging.getLogger("DiffusersServer.Pipelines") @@ -296,7 +279,7 @@ async def metrics_loop(): request_pipe = RequestScopedPipeline(model_pipeline.pipeline) pipeline_lock = threading.Lock() - logger.info(f"Pipeline inicializado y listo para recibir solicitudes (modelo={server_config.model})") + logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") app.state.MODEL_INITIALIZER = initializer app.state.MODEL_PIPELINE = model_pipeline @@ -335,9 +318,9 @@ async def api(json: JSONBodyQueryAPI): req_pipe = app.state.REQUEST_PIPE if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Modelo no inicializado correctamente") + raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): - raise HTTPException(400, "No se proporcionó prompt") + raise HTTPException(400, "No prompt provided") def make_generator(): g = torch.Generator(device=initializer.device) @@ -345,45 +328,56 @@ def make_generator(): def infer(): gen = make_generator() - return req_pipe.generate( - prompt=prompt, - negative_prompt=negative_prompt, - generator=gen, - 
num_inference_steps=num_steps, - num_images_per_prompt=num_images_per_prompt, - device=initializer.device - ) + + # Maybe this will improve some performance (I'll test it) + with torch.no_grad(): + output = req_pipe.generate( + prompt=prompt, + negative_prompt=negative_prompt, + generator=gen, + num_inference_steps=num_steps, + num_images_per_prompt=num_images_per_prompt, + device=initializer.device + ) + + return output try: async with app.state.metrics_lock: app.state.active_inferences += 1 output = await run_in_threadpool(infer) - - saved_urls = [] - loop = asyncio.get_running_loop() - + images = getattr(output, "images", []) or [] - for idx, img in enumerate(images): + + saved_urls = [] + + for i, img in enumerate(images): try: - url = await loop.run_in_executor(app.state.SAVE_EXECUTOR, utils_app.save_image, img) + + url = await utils_app.save_image(img) saved_urls.append(url) - except Exception as e: - logger.error(f"Error guardando imagen {idx}: {e}") - finally: - try: - del img - except Exception: - pass - import gc - gc.collect() + + if isinstance(img, Image.Image): + img.close() + del img + if torch.cuda.is_available(): - try: - torch.cuda.synchronize() - except Exception: - pass - torch.cuda.empty_cache() + torch.cuda.synchronize() + + except Exception as e: + logger.error(f"Error saving image {i}: {e}") + continue + + del output, images + + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + + gc.collect() + async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) @@ -392,14 +386,14 @@ def infer(): except Exception as e: async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - logger.error(f"Error durante la inferencia: {e}") - raise HTTPException(500, f"Error en procesamiento: {e}") + logger.error(f"Error during inference: {e}") + raise HTTPException(500, f"Error in processing: {e}") finally: - import gc - gc.collect() if torch.cuda.is_available(): + torch.cuda.synchronize() torch.cuda.empty_cache() + gc.collect() @app.get("/images/{filename}") diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt index d5a3746c235b..aafa93b7023f 100644 --- a/examples/server-async/requirements.txt +++ b/examples/server-async/requirements.txt @@ -4,4 +4,7 @@ transformers sentencepiece fastapi uvicorn -ftfy \ No newline at end of file +ftfy +accelerate +xformers +protobuf \ No newline at end of file From b052d27fd7390d30a79d51f9a8b55b62d154b36e Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 13 Sep 2025 22:04:14 -0600 Subject: [PATCH 17/34] We keep the implementation simple in examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 146 +------------ .../server-async/DiffusersServer/__init__.py | 1 - .../DiffusersServer/serverasync.py | 195 ++++-------------- .../DiffusersServer/superpipeline.py | 50 ----- .../DiffusersServer/uvicorn_diffu.py | 36 +--- 5 files changed, 58 insertions(+), 370 deletions(-) delete mode 100644 examples/server-async/DiffusersServer/superpipeline.py diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 60be11b2f241..bc60d4811c3e 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -1,5 +1,4 @@ # Pipelines.py - from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from 
diffusers.pipelines.flux.pipeline_flux import FluxPipeline from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline @@ -7,7 +6,6 @@ import os import logging from pydantic import BaseModel -import gc logger = logging.getLogger(__name__) @@ -22,155 +20,36 @@ def __init__(self, model_path: str | None = None): self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusion3Pipeline | None = None self.device: str | None = None - + def start(self): - torch.set_float32_matmul_precision("high") - - if hasattr(torch._inductor, 'config'): - if hasattr(torch._inductor.config, 'conv_1x1_as_mm'): - torch._inductor.config.conv_1x1_as_mm = True - if hasattr(torch._inductor.config, 'coordinate_descent_tuning'): - torch._inductor.config.coordinate_descent_tuning = True - if hasattr(torch._inductor.config, 'epilogue_fusion'): - torch._inductor.config.epilogue_fusion = False - if hasattr(torch._inductor.config, 'coordinate_descent_check_all_directions'): - torch._inductor.config.coordinate_descent_check_all_directions = True - - if torch.cuda.is_available(): - torch.backends.cudnn.benchmark = True - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.deterministic = False - torch.backends.cudnn.allow_tf32 = True - if torch.cuda.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" - logger.info(f"Loading CUDA with model: {model_path}") + logger.info("Loading CUDA") self.device = "cuda" - - torch.cuda.empty_cache() - gc.collect() - self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.float16, - use_safetensors=True, - variant="fp16" if "fp16" in model_path else None, - low_cpu_mem_usage=True, - ) - - self.pipeline = self.pipeline.to(device=self.device) - - if hasattr(self.pipeline, 'enable_vae_slicing'): - self.pipeline.enable_vae_slicing() - logger.info("VAE slicing enabled - will reduce memory spikes during decoding") - - if hasattr(self.pipeline, 'enable_vae_tiling'): - self.pipeline.enable_vae_tiling() - logger.info("VAE tiling enabled - will allow processing larger images") - - if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: - self.pipeline.transformer = self.pipeline.transformer.to( - memory_format=torch.channels_last - ) - logger.info("Transformer optimized with channels_last format") - - if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: - self.pipeline.vae = self.pipeline.vae.to( - memory_format=torch.channels_last - ) - - if hasattr(self.pipeline.vae, 'enable_slicing'): - self.pipeline.vae.enable_slicing() - logger.info("VAE slicing activated directly in the VAE") - - if hasattr(self.pipeline.vae, 'enable_tiling'): - self.pipeline.vae.enable_tiling() - logger.info("VAE tiling activated directly on the VAE") - - logger.info("VAE optimized with channels_last format") - - try: - self.pipeline.enable_xformers_memory_efficient_attention() - logger.info("XFormers memory efficient attention enabled") - except Exception as e: - logger.info(f"XFormers not available: {e}") - - logger.info("Skipping torch.compile - running without compile optimizations by design") - - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - logger.info("CUDA pipeline fully optimized and ready") - + ).to(device=self.device) elif torch.backends.mps.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" - logger.info(f"Loading MPS for Mac M Series with model: {model_path}") + 
logger.info("Loading MPS for Mac M Series") self.device = "mps" - self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.bfloat16, - use_safetensors=True, - low_cpu_mem_usage=True, ).to(device=self.device) - - if hasattr(self.pipeline, 'enable_vae_slicing'): - self.pipeline.enable_vae_slicing() - logger.info("VAE slicing enabled in MPS") - - if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: - self.pipeline.transformer = self.pipeline.transformer.to( - memory_format=torch.channels_last - ) - - if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: - self.pipeline.vae = self.pipeline.vae.to( - memory_format=torch.channels_last - ) - - logger.info("MPS pipeline optimized and ready") - else: raise Exception("No CUDA or MPS device available") - - - self._warmup() - - logger.info("Pipeline initialization completed successfully") - - def _warmup(self): - if self.pipeline: - logger.info("Running warmup inference...") - with torch.no_grad(): - _ = self.pipeline( - prompt="warmup", - num_inference_steps=1, - height=512, - width=512, - guidance_scale=1.0, - ) - - if self.device == "cuda": - torch.cuda.synchronize() - torch.cuda.empty_cache() - - gc.collect() - logger.info("Warmup completed with memory cleanup") class TextToImagePipelineFlux: def __init__(self, model_path: str | None = None, low_vram: bool = False): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: FluxPipeline = None - self.device: str = None + self.pipeline: FluxPipeline | None = None + self.device: str | None = None self.low_vram = low_vram def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading CUDA") self.device = "cuda" @@ -183,7 +62,6 @@ def start(self): else: pass elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -196,17 +74,12 @@ def start(self): class TextToImagePipelineSD: def __init__(self, model_path: str | None = None): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: StableDiffusionPipeline = None - self.device: str = None + self.pipeline: StableDiffusionPipeline | None = None + self.device: str | None = None def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading CUDA") self.device = "cuda" @@ -215,7 +88,6 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. 
model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -224,4 +96,4 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) else: - raise Exception("No CUDA or MPS device available") \ No newline at end of file + raise Exception("No CUDA or MPS device available") diff --git a/examples/server-async/DiffusersServer/__init__.py b/examples/server-async/DiffusersServer/__init__.py index d4dc75b71a1f..0d8d5761a939 100644 --- a/examples/server-async/DiffusersServer/__init__.py +++ b/examples/server-async/DiffusersServer/__init__.py @@ -1,3 +1,2 @@ from .Pipelines import TextToImagePipelineSD3 -from .superpipeline import SuperPipelinesT2Img from .create_server import create_inference_server_Async as DiffusersServerApp \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index a3392500d9f6..ff0e64080d81 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -5,7 +5,6 @@ from pydantic import BaseModel from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD import logging -from diffusers.utils.export_utils import export_to_video from diffusers.pipelines.pipeline_utils import RequestScopedPipeline from diffusers import * from .superpipeline import * @@ -22,7 +21,6 @@ from typing import List from contextlib import asynccontextmanager import asyncio -from PIL import Image @dataclass class PresetModels: @@ -81,106 +79,30 @@ def __init__(self, host: str = '0.0.0.0', port: int = 8500): if not os.path.exists(self.video_dir): os.makedirs(self.video_dir) - from concurrent.futures import ThreadPoolExecutor - self.executor = ThreadPoolExecutor(max_workers=2) + def save_image(self, image): + if hasattr(image, "to"): + try: + image = image.to("cpu") + except Exception: + pass - def _save_pil_image(self, pil_image: Image.Image, filepath: str): - try: - pil_image.save(filepath, format="PNG", optimize=True, compress_level=6) - except Exception as e: - logger.error(f"Error saving PIL image: {e}") - raise - finally: - if pil_image: - pil_image.close() - del pil_image - - def _tensor_to_pil_optimized(self, tensor: torch.Tensor) -> Image.Image: - - with torch.no_grad(): - tensor_cpu = tensor.detach().clone() - - if tensor_cpu.is_cuda: - tensor_cpu = tensor_cpu.cpu() - torch.cuda.synchronize() - - if tensor_cpu.dim() == 4: - tensor_cpu = tensor_cpu[0] - - tensor_cpu = tensor_cpu.clamp(0, 1).mul(255).byte() - - if tensor_cpu.shape[0] in [1, 3, 4]: - tensor_cpu = tensor_cpu.permute(1, 2, 0) - - np_array = tensor_cpu.contiguous().numpy() - - del tensor_cpu - - if np_array.shape[-1] == 1: - np_array = np_array.squeeze(-1) - mode = 'L' - elif np_array.shape[-1] == 3: - mode = 'RGB' - elif np_array.shape[-1] == 4: - mode = 'RGBA' - else: - raise ValueError(f"Unsupported number of channels: {np_array.shape[-1]}") - - pil_image = Image.fromarray(np_array, mode=mode) - - del np_array - - return pil_image - - async def save_image(self, image) -> str: - - image_id = str(uuid.uuid4()).split("-")[0] - filename = f"img{image_id}.png" - filepath = os.path.join(self.image_dir, filename) - url = os.path.join(self.service_url, "images", filename) - - loop = asyncio.get_event_loop() - - try: - if isinstance(image, Image.Image): - await loop.run_in_executor( - self.executor, - self._save_pil_image, - image, - filepath - ) - - elif 
isinstance(image, torch.Tensor): - with torch.no_grad(): - pil_image = await loop.run_in_executor( - None, - self._tensor_to_pil_optimized, - image - ) - - await loop.run_in_executor( - self.executor, - self._save_pil_image, - pil_image, - filepath - ) - - del pil_image - - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - logger.debug(f"Image saved: {filename}") - return url - - except Exception as e: - logger.error(f"Error in save_image_optimized: {e}") - raise - finally: - gc.collect() - - def shutdown(self): - self.executor.shutdown(wait=True) + if isinstance(image, torch.Tensor): + from torchvision import transforms + to_pil = transforms.ToPILImage() + image = to_pil(image.squeeze(0).clamp(0, 1)) + + filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" + image_path = os.path.join(self.image_dir, filename) + logger.info(f"Saving image to {image_path}") + + image.save(image_path, format="PNG", optimize=True) + + del image + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return os.path.join(self.service_url, "images", filename) @dataclass class ServerConfigModels: @@ -203,6 +125,8 @@ def create_app_fastapi(config: ServerConfigModels) -> FastAPI: async def lifespan(app: FastAPI): logging.basicConfig(level=logging.INFO) app.state.logger = logging.getLogger("diffusers-server") + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' + os.environ['CUDA_LAUNCH_BLOCKING'] = '0' app.state.total_requests = 0 app.state.active_inferences = 0 @@ -227,8 +151,6 @@ async def metrics_loop(): raise app.state.metrics_task = asyncio.create_task(metrics_loop()) - from concurrent.futures import ThreadPoolExecutor - app.state.SAVE_EXECUTOR = ThreadPoolExecutor(max_workers=1) try: yield @@ -314,74 +236,45 @@ async def api(json: JSONBodyQueryAPI): wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER - utils_app = app.state.utils_app - req_pipe = app.state.REQUEST_PIPE + + utils_app = app.state.utils_app + if not wrapper or not wrapper.pipeline: raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): raise HTTPException(400, "No prompt provided") + def make_generator(): g = torch.Generator(device=initializer.device) return g.manual_seed(random.randint(0, 10_000_000)) + req_pipe = app.state.REQUEST_PIPE + def infer(): gen = make_generator() - - # Maybe this will improve some performance (I'll test it) - with torch.no_grad(): - output = req_pipe.generate( - prompt=prompt, - negative_prompt=negative_prompt, - generator=gen, - num_inference_steps=num_steps, - num_images_per_prompt=num_images_per_prompt, - device=initializer.device - ) - - return output + return req_pipe.generate( + prompt=prompt, + negative_prompt=negative_prompt, + generator=gen, + num_inference_steps=num_steps, + num_images_per_prompt=num_images_per_prompt, + device=initializer.device, + output_type="pil", + ) try: async with app.state.metrics_lock: app.state.active_inferences += 1 output = await run_in_threadpool(infer) - - images = getattr(output, "images", []) or [] - - saved_urls = [] - - for i, img in enumerate(images): - try: - url = await utils_app.save_image(img) - saved_urls.append(url) - - if isinstance(img, Image.Image): - img.close() - del img - - if torch.cuda.is_available(): - torch.cuda.synchronize() - - except Exception as e: - logger.error(f"Error saving image {i}: {e}") - continue - - - del output, images - - if torch.cuda.is_available(): - torch.cuda.synchronize() - 
torch.cuda.empty_cache() - - gc.collect() - async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - - return {"response": saved_urls} + + urls = [utils_app.save_image(img) for img in output.images] + return {"response": urls} except Exception as e: async with app.state.metrics_lock: @@ -393,6 +286,8 @@ def infer(): if torch.cuda.is_available(): torch.cuda.synchronize() torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.ipc_collect() gc.collect() diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py deleted file mode 100644 index 4e2bb9452c4a..000000000000 --- a/examples/server-async/DiffusersServer/superpipeline.py +++ /dev/null @@ -1,50 +0,0 @@ -from diffusers.pipelines import * -from diffusers import * -import torch -from typing import Optional, Dict, Any, Type -import logging - -logger = logging.getLogger(__name__) - -class SuperPipelinesT2Img: - def __init__(self, model_path: str, - pipeline: Type, - torch_dtype = torch.bfloat16, - components: Optional[Dict[str, Any]] = None,): - """ - Clase para crear tus Pipelines personalizados para tu API custom - Args: - model_path: Ruta o nombre del modelo - pipeline: Clase del pipeline a utilizar - torch_dtype: Tipo de datos de PyTorch a utilizar - components: Diccionario de componentes personalizados - """ - self.model_path = model_path - self.pipeline = pipeline - self.torch_dtype = torch_dtype - self.components = components or {} - self.device: str = None - - def start(self): - if torch.cuda.is_available(): - logger.info("Loading CUDA") - model_path = self.model_path - self.device = 'cuda' - self.pipeline = self.pipeline.from_pretrained( - model_path, - torch_dtype = self.torch_dtype, - ** self.components - ).to(device=self.device) - elif torch.backends.mps.is_available(): - logger.info("Loading MPS for Mac M Series") - model_path = self.model_path - self.device = 'mps' - self.pipeline = self.pipeline.from_pretrained( - model_path, - torch_dtype = self.torch_dtype, - **self.components - ).to(device=self.device) - else: - raise Exception("No CUDA or MPS device available") - - return self \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py index faefc5c2f0ee..c2688e25497d 100644 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -5,9 +5,7 @@ import os import threading import time -import string -# Configuración de logging def setup_logging(): logging.basicConfig(level=logging.INFO) return logging.getLogger('uvicorn') @@ -15,25 +13,17 @@ def setup_logging(): logger = setup_logging() def memory_cleanup(interval=30): - """ - Función para monitorear y limpiar la memoria periódicamente - - Args: - interval (int): Intervalo en segundos entre limpiezas - """ while True: try: - # Forzar recolección de basura gc.collect() - # Obtener información de memoria actual process = psutil.Process(os.getpid()) mem = process.memory_info().rss / 1024 / 1024 - logger.info(f"Memoria en uso: {mem:.2f} MB") + logger.info(f"Memory in use: {mem:.2f} MB") time.sleep(interval) except Exception as e: - logger.error(f"Error en limpieza de memoria: {str(e)}") + logger.error(f"Memory clearing error: {str(e)}") time.sleep(interval) def run_uvicorn_server( @@ -48,27 +38,9 @@ def run_uvicorn_server( ], enable_memory_monitor=True ): - """ - Ejecuta un servidor de 
FastAPI utilizando Uvicorn con monitoreo de memoria opcional - - Args: - app: Aplicación FastAPI - host (str): Host donde se servirá la aplicación - port (int): Puerto para el servidor - workers (int): Número de hilos para Uvicorn - cleanup_interval (int): Intervalo de limpieza para Uvicorn - channel_timeout (int): Tiempo de espera máximo para canales - server_header (bool): Activar el identificador / Header del servidor - headers (str): Identificador del servidor / Header del servidor - enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria - - Returns: - El resultado de serve() (aunque normalmente no retorna) - """ gc.enable() gc.set_threshold(700, 10, 5) - # Iniciar monitoreo de memoria si está habilitado if enable_memory_monitor: cleanup_thread = threading.Thread( target=memory_cleanup, @@ -76,9 +48,9 @@ def run_uvicorn_server( daemon=True ) cleanup_thread.start() - logger.info("Monitor de memoria activado") + logger.info("Memory monitor activated") - logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") + logger.info(f"Starting Uvicorn server in {host}:{port}...") config = uvicorn.Config( app=app, From 0f63f4d4362a769e082a37e0a8e344eb427985ab Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 13 Sep 2025 22:06:24 -0600 Subject: [PATCH 18/34] Update examples/server-async/README.md --- examples/server-async/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 10b4c1825098..43b86d52442d 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,6 +1,7 @@ # Asynchronous server and parallel execution of models > Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. 
+> We recommend running about 10 to 50 inferences in parallel to have a good performance of 25-30s to 1-1:30min on average ## ⚠️ IMPORTANT From a9666b11fca95b24318893054bd384c37cc3126d Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 13 Sep 2025 22:16:42 -0600 Subject: [PATCH 19/34] Update examples/server-async/README.md for changes to tokenizer locks and backward-compatible retrieve_timesteps --- examples/server-async/README.md | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 43b86d52442d..edf07852c247 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -14,16 +14,14 @@ All the components needed to create the inference server are in `DiffusersServer/` ``` -DiffusersServer/ # the example server package -├── __init__.py +DiffusersServer/ +├── **init**.py ├── create_server.py # helper script to build/run the app programmatically ├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) -├── serverasync.py # FastAPI app factory (create_app_fastapi) -├── superpipeline.py # optional custom pipeline glue code +├── serverasync.py # FastAPI app factory (create\_app\_fastapi) ├── uvicorn_diffu.py # convenience script to start uvicorn with recommended flags ``` - ## What `diffusers-async` adds / Why we needed it Core problem: a naive server that calls `pipe.__call__` concurrently can hit **race conditions** (e.g., `scheduler.set_timesteps` mutates shared state) or explode memory by deep-copying the whole pipeline per-request. @@ -32,7 +30,8 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r * **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*. * **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. Where available we call `scheduler.clone_for_request(...)`, otherwise we fallback to safe `deepcopy` or other heuristics. -* **`retrieve_timesteps(..., return_scheduler=True)`**: retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. This is the safe path for getting a scheduler configured per-request. +* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur. +* **`retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API. * **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. ## How the server works (high-level flow) @@ -51,7 +50,6 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r 3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`). 4. Multiple requests can run in parallel while sharing heavy weights and isolating mutable state. 
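+
+A minimal sketch of the same flow outside the server (illustrative only: the model id is just an example, and the import path assumes the `diffusers` fork used by this demo; later revisions move `RequestScopedPipeline` into `examples/server-async/utils`):
+
+```python
+import torch
+from concurrent.futures import ThreadPoolExecutor
+
+from diffusers import StableDiffusion3Pipeline
+from diffusers.pipelines.pipeline_utils import RequestScopedPipeline  # provided by the fork
+
+# Heavy weights are loaded once and shared by every request
+base = StableDiffusion3Pipeline.from_pretrained(
+    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.float16
+).to("cuda")
+request_pipe = RequestScopedPipeline(base)
+
+def handle_request(prompt: str):
+    # Each call gets its own scheduler/RNG state; UNet, VAE and text encoders stay shared
+    result = request_pipe.generate(prompt=prompt, num_inference_steps=30, device="cuda")
+    return result.images[0]
+
+# Two concurrent "requests" against the single in-memory model
+with ThreadPoolExecutor(max_workers=2) as pool:
+    images = list(pool.map(handle_request, ["a lighthouse at dusk", "a red fox in the snow"]))
+```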
- ## How to set up and run the server ### 1) Install dependencies @@ -65,7 +63,7 @@ If using the `diffusers` fork via git, either: ```bash pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" pip install -r requirements.txt -``` +```` ### 2) Start the server @@ -97,17 +95,14 @@ Response example: ## Troubleshooting (quick) -* `Already borrowed` — tokenizers (Rust) error when used concurrently. +* `Already borrowed` — previously a Rust tokenizer concurrency error. + ✅ This is now fixed: `RequestScopedPipeline` manages an internal tokenizer lock so race conditions no longer happen. - * Workarounds: - - * Acquire a `Lock` around tokenization or around the pipeline call (serializes that part). - * Use the slow tokenizer (`converter_to_slow`) for concurrency tests. - * Patch only the tokenization method to use a lock instead of serializing entire forward. * `can't set attribute 'components'` — pipeline exposes read-only `components`. * The RequestScopedPipeline now detects read-only properties and skips setting them. + * Scheduler issues: * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. - + * ✅ Note: `retrieve_timesteps` is fully retro-compatible — if you don’t pass `return_scheduler=True`, the behavior is unchanged. From 06bb13644174125eb79f83c7692e6e034d5e737a Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 14 Sep 2025 18:39:06 -0600 Subject: [PATCH 20/34] The changes to the diffusers core have been undone and all logic is being moved to exmaples/server-async --- .../DiffusersServer/serverasync.py | 5 +- examples/server-async/utils/__init__.py | 1 + .../utils/requestscopedpipeline.py | 266 +++++++++++++++++ examples/server-async/utils/scheduler.py | 118 ++++++++ src/diffusers/pipelines/flux/pipeline_flux.py | 80 ++--- src/diffusers/pipelines/pipeline_utils.py | 276 +----------------- .../pipeline_stable_diffusion.py | 81 ++--- .../pipeline_stable_diffusion_3.py | 81 ++--- .../pipeline_stable_diffusion_xl.py | 81 ++--- .../pipeline_stable_diffusion_adapter.py | 80 ++--- .../pipeline_stable_diffusion_xl_adapter.py | 80 ++--- src/diffusers/schedulers/scheduling_amused.py | 5 - .../scheduling_consistency_decoder.py | 6 +- .../scheduling_consistency_models.py | 5 - .../scheduling_cosine_dpmsolver_multistep.py | 5 - src/diffusers/schedulers/scheduling_ddim.py | 5 - .../schedulers/scheduling_ddim_cogvideox.py | 5 - .../schedulers/scheduling_ddim_inverse.py | 5 - .../schedulers/scheduling_ddim_parallel.py | 6 - src/diffusers/schedulers/scheduling_ddpm.py | 5 - .../schedulers/scheduling_ddpm_parallel.py | 5 - .../schedulers/scheduling_ddpm_wuerstchen.py | 5 - .../schedulers/scheduling_deis_multistep.py | 5 - .../schedulers/scheduling_dpm_cogvideox.py | 5 - .../scheduling_dpmsolver_multistep.py | 6 - .../scheduling_dpmsolver_multistep_inverse.py | 5 - .../schedulers/scheduling_dpmsolver_sde.py | 5 - .../scheduling_dpmsolver_singlestep.py | 5 - .../scheduling_edm_dpmsolver_multistep.py | 5 - .../schedulers/scheduling_edm_euler.py | 5 - .../scheduling_euler_ancestral_discrete.py | 5 - .../schedulers/scheduling_euler_discrete.py | 5 - .../scheduling_flow_match_euler_discrete.py | 5 - src/diffusers/schedulers/scheduling_sde_ve.py | 4 - src/diffusers/schedulers/scheduling_tcd.py | 5 - src/diffusers/schedulers/scheduling_unclip.py | 5 - .../schedulers/scheduling_unipc_multistep.py | 5 - .../schedulers/scheduling_vq_diffusion.py | 5 - 38 
files changed, 498 insertions(+), 788 deletions(-) create mode 100644 examples/server-async/utils/__init__.py create mode 100644 examples/server-async/utils/requestscopedpipeline.py create mode 100644 examples/server-async/utils/scheduler.py diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index ff0e64080d81..61eb99c3fdce 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -3,11 +3,10 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel -from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD +from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD, logger import logging -from diffusers.pipelines.pipeline_utils import RequestScopedPipeline +from ..utils import RequestScopedPipeline from diffusers import * -from .superpipeline import * import random import uuid import tempfile diff --git a/examples/server-async/utils/__init__.py b/examples/server-async/utils/__init__.py new file mode 100644 index 000000000000..38b01f7aa59d --- /dev/null +++ b/examples/server-async/utils/__init__.py @@ -0,0 +1 @@ +from .requestscopedpipeline import RequestScopedPipeline \ No newline at end of file diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py new file mode 100644 index 000000000000..56f5626ed156 --- /dev/null +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -0,0 +1,266 @@ +from typing import Optional, Any, Iterable, List +import copy +import threading +import torch +from diffusers.utils import logging + +logger = logging.get_logger(__name__) + +def safe_tokenize(tokenizer, *args, lock, **kwargs): + with lock: + return tokenizer(*args, **kwargs) + +class RequestScopedPipeline: + DEFAULT_MUTABLE_ATTRS = [ + "_all_hooks", + "_offload_device", + "_progress_bar_config", + "_progress_bar", + "_rng_state", + "_last_seed", + "latents", + ] + + def __init__( + self, + pipeline: Any, + mutable_attrs: Optional[Iterable[str]] = None, + auto_detect_mutables: bool = True, + tensor_numel_threshold: int = 1_000_000, + tokenizer_lock: Optional[threading.Lock] = None + ): + self._base = pipeline + self.unet = getattr(pipeline, "unet", None) + self.vae = getattr(pipeline, "vae", None) + self.text_encoder = getattr(pipeline, "text_encoder", None) + self.components = getattr(pipeline, "components", None) + + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() + + self._auto_detect_mutables = bool(auto_detect_mutables) + self._tensor_numel_threshold = int(tensor_numel_threshold) + + self._auto_detected_attrs: List[str] = [] + + def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): + base_sched = getattr(self._base, "scheduler", None) + if base_sched is None: + return None + + if hasattr(base_sched, "clone_for_request"): + try: + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) + except Exception as e: + logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + + try: + return copy.deepcopy(base_sched) + except Exception as e: + logger.warning(f"Deepcopy of scheduler 
failed: {e}. Returning original scheduler (*risky*).") + return base_sched + + def _autodetect_mutables(self, max_attrs: int = 40): + if not self._auto_detect_mutables: + return [] + + if self._auto_detected_attrs: + return self._auto_detected_attrs + + candidates: List[str] = [] + seen = set() + for name in dir(self._base): + if name.startswith("__"): + continue + if name in self._mutable_attrs: + continue + if name in ("to", "save_pretrained", "from_pretrained"): + continue + try: + val = getattr(self._base, name) + except Exception: + continue + + import types + + # skip callables and modules + if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): + continue + + # containers -> candidate + if isinstance(val, (dict, list, set, tuple, bytearray)): + candidates.append(name) + seen.add(name) + else: + # try Tensor detection + try: + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + candidates.append(name) + seen.add(name) + else: + logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}") + except Exception: + continue + + if len(candidates) >= max_attrs: + break + + self._auto_detected_attrs = candidates + logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") + return self._auto_detected_attrs + + def _is_readonly_property(self, base_obj, attr_name: str) -> bool: + try: + cls = type(base_obj) + descriptor = getattr(cls, attr_name, None) + if isinstance(descriptor, property): + return descriptor.fset is None + if hasattr(descriptor, "__set__") is False and descriptor is not None: + return False + except Exception: + pass + return False + + def _clone_mutable_attrs(self, base, local): + attrs_to_clone = list(self._mutable_attrs) + attrs_to_clone.extend(self._autodetect_mutables()) + + EXCLUDE_ATTRS = {"components",} + + for attr in attrs_to_clone: + if attr in EXCLUDE_ATTRS: + logger.debug(f"Skipping excluded attr '{attr}'") + continue + if not hasattr(base, attr): + continue + if self._is_readonly_property(base, attr): + logger.debug(f"Skipping read-only property '{attr}'") + continue + + try: + val = getattr(base, attr) + except Exception as e: + logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}") + continue + + try: + if isinstance(val, dict): + setattr(local, attr, dict(val)) + elif isinstance(val, (list, tuple, set)): + setattr(local, attr, list(val)) + elif isinstance(val, bytearray): + setattr(local, attr, bytearray(val)) + else: + # small tensors or atomic values + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + setattr(local, attr, val.clone()) + else: + # don't clone big tensors, keep reference + setattr(local, attr, val) + else: + try: + setattr(local, attr, copy.copy(val)) + except Exception: + setattr(local, attr, val) + except (AttributeError, TypeError) as e: + logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}") + continue + except Exception as e: + logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") + continue + + def _is_tokenizer_component(self, component) -> bool: + if component is None: + return False + + tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) + + class_name = component.__class__.__name__.lower() + has_tokenizer_in_name = 'tokenizer' in class_name + + tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + 
has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) + + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) + + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): + local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) + + try: + local_pipe = copy.copy(self._base) + except Exception as e: + logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).") + local_pipe = copy.deepcopy(self._base) + + if local_scheduler is not None: + try: + setattr(local_pipe, "scheduler", local_scheduler) + except Exception: + logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") + + self._clone_mutable_attrs(self._base, local_pipe) + + # 4) wrap tokenizers on the local pipe with the lock wrapper + tokenizer_wrappers = {} # name -> original_tokenizer + try: + # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) + for name in dir(local_pipe): + if "tokenizer" in name and not name.startswith("_"): + tok = getattr(local_pipe, name, None) + if tok is not None and self._is_tokenizer_component(tok): + tokenizer_wrappers[name] = tok + setattr( + local_pipe, + name, + lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + ) + + # b) wrap tokenizers in components dict + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): + for key, val in local_pipe.components.items(): + if val is None: + continue + + if self._is_tokenizer_component(val): + tokenizer_wrappers[f"components[{key}]"] = val + local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( + tokenizer, *args, lock=self._tokenizer_lock, **kwargs + ) + + except Exception as e: + logger.debug(f"Tokenizer wrapping step encountered an error: {e}") + + result = None + cm = getattr(local_pipe, "model_cpu_offload_context", None) + try: + if callable(cm): + try: + with cm(): + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except TypeError: + # cm might be a context manager instance rather than callable + try: + with cm: + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except Exception as e: + logger.debug(f"model_cpu_offload_context usage failed: {e}. 
Proceeding without it.") + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + else: + # no offload context available — call directly + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + + return result + + finally: + try: + for name, tok in tokenizer_wrappers.items(): + if name.startswith("components["): + key = name[len("components["):-1] + local_pipe.components[key] = tok + else: + setattr(local_pipe, name, tok) + except Exception as e: + logger.debug(f"Error restoring wrapped tokenizers: {e}") \ No newline at end of file diff --git a/examples/server-async/utils/scheduler.py b/examples/server-async/utils/scheduler.py new file mode 100644 index 000000000000..a20715e254cd --- /dev/null +++ b/examples/server-async/utils/scheduler.py @@ -0,0 +1,118 @@ +from typing import Any, Optional, Union, List +import torch +import copy +import inspect + +class BaseAsyncScheduler: + def __init__(self, scheduler: Any): + pass + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + # I leave it as an example of what the Scheduler should do to implement it later + """local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local""" + pass + + +def async_retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + + Returns: + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. 
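+
+    Example (illustrative sketch; `pipe` here stands for any loaded diffusers pipeline):
+
+    ```py
+    # Non-mutating, per-request path: the shared `pipe.scheduler` is left untouched
+    timesteps, steps, local_scheduler = async_retrieve_timesteps(
+        pipe.scheduler, num_inference_steps=30, device="cuda", return_scheduler=True
+    )
+    # Default path matches the original helper and configures `pipe.scheduler` in place
+    timesteps, steps = async_retrieve_timesteps(pipe.scheduler, num_inference_steps=30, device="cuda")
+    ```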
+ """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) + if timesteps is not None: + accepts_timesteps = _accepts("timesteps") + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) + elif sigmas is not None: + accept_sigmas = _accepts("sigmas") + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) + else: + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps \ No newline at end of file diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 1ae0156c71d6..42d20472bf0b 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -92,18 +92,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. 
Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -120,72 +112,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class FluxPipeline( diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index de6200f30c84..d311b5b6df20 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -21,9 +21,8 @@ import sys from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin, Iterable -import copy -import threading +from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin + import numpy as np import PIL.Image @@ -71,8 +70,6 @@ ) from ..utils.hub_utils import _check_legacy_sharding_variant_format, load_or_create_model_card, populate_model_card from ..utils.torch_utils import empty_device_cache, get_device, is_compiled_module -import copy -from types import SimpleNamespace if is_torch_npu_available(): @@ -182,275 +179,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -def safe_tokenize(tokenizer, *args, lock, **kwargs): - with lock: - return tokenizer(*args, **kwargs) - - -class RequestScopedPipeline: - DEFAULT_MUTABLE_ATTRS = [ - "_all_hooks", - "_offload_device", - "_progress_bar_config", - "_progress_bar", - "_rng_state", - "_last_seed", - "latents", - ] - - def __init__( - self, - pipeline: Any, - mutable_attrs: Optional[Iterable[str]] = None, - auto_detect_mutables: bool = True, - tensor_numel_threshold: int = 1_000_000, - tokenizer_lock: Optional[threading.Lock] = None - ): - self._base = pipeline - self.unet = getattr(pipeline, "unet", None) - self.vae = getattr(pipeline, "vae", None) - self.text_encoder = getattr(pipeline, "text_encoder", None) - self.components = getattr(pipeline, "components", None) - - self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) - self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() - - self._auto_detect_mutables = bool(auto_detect_mutables) - self._tensor_numel_threshold = 
int(tensor_numel_threshold) - - self._auto_detected_attrs: List[str] = [] - - def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): - base_sched = getattr(self._base, "scheduler", None) - if base_sched is None: - return None - - if hasattr(base_sched, "clone_for_request"): - try: - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) - except Exception as e: - logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") - - try: - return copy.deepcopy(base_sched) - except Exception as e: - logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).") - return base_sched - - def _autodetect_mutables(self, max_attrs: int = 40): - if not self._auto_detect_mutables: - return [] - - if self._auto_detected_attrs: - return self._auto_detected_attrs - - candidates: List[str] = [] - seen = set() - for name in dir(self._base): - if name.startswith("__"): - continue - if name in self._mutable_attrs: - continue - if name in ("to", "save_pretrained", "from_pretrained"): - continue - try: - val = getattr(self._base, name) - except Exception: - continue - - import types - - # skip callables and modules - if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): - continue - - # containers -> candidate - if isinstance(val, (dict, list, set, tuple, bytearray)): - candidates.append(name) - seen.add(name) - else: - # try Tensor detection - try: - if isinstance(val, torch.Tensor): - if val.numel() <= self._tensor_numel_threshold: - candidates.append(name) - seen.add(name) - else: - logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}") - except Exception: - continue - - if len(candidates) >= max_attrs: - break - - self._auto_detected_attrs = candidates - logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") - return self._auto_detected_attrs - - def _is_readonly_property(self, base_obj, attr_name: str) -> bool: - try: - cls = type(base_obj) - descriptor = getattr(cls, attr_name, None) - if isinstance(descriptor, property): - return descriptor.fset is None - if hasattr(descriptor, "__set__") is False and descriptor is not None: - return False - except Exception: - pass - return False - - def _clone_mutable_attrs(self, base, local): - attrs_to_clone = list(self._mutable_attrs) - attrs_to_clone.extend(self._autodetect_mutables()) - - EXCLUDE_ATTRS = {"components",} - - for attr in attrs_to_clone: - if attr in EXCLUDE_ATTRS: - logger.debug(f"Skipping excluded attr '{attr}'") - continue - if not hasattr(base, attr): - continue - if self._is_readonly_property(base, attr): - logger.debug(f"Skipping read-only property '{attr}'") - continue - - try: - val = getattr(base, attr) - except Exception as e: - logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}") - continue - - try: - if isinstance(val, dict): - setattr(local, attr, dict(val)) - elif isinstance(val, (list, tuple, set)): - setattr(local, attr, list(val)) - elif isinstance(val, bytearray): - setattr(local, attr, bytearray(val)) - else: - # small tensors or atomic values - if isinstance(val, torch.Tensor): - if val.numel() <= self._tensor_numel_threshold: - setattr(local, attr, val.clone()) - else: - # don't clone big tensors, keep reference - setattr(local, attr, val) - else: - try: - setattr(local, attr, copy.copy(val)) - except Exception: - # último recurso: asignar referencia - setattr(local, attr, val) - except 
(AttributeError, TypeError) as e: - logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}") - # continue without failing - continue - except Exception as e: - logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") - continue - - def _is_tokenizer_component(self, component) -> bool: - """Determine whether a component is a tokenizer, based on common methods and attributes.""" - if component is None: - return False - - # Check for common tokenizer methods - tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] - has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - - # Check the class name - class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = 'tokenizer' in class_name - - # Check for common tokenizer attributes - tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] - has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - - return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) - - def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): - local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) - - try: - local_pipe = copy.copy(self._base) - except Exception as e: - logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).") - local_pipe = copy.deepcopy(self._base) - - if local_scheduler is not None: - try: - setattr(local_pipe, "scheduler", local_scheduler) - except Exception: - logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") - - self._clone_mutable_attrs(self._base, local_pipe) - - # 4) wrap tokenizers on the local pipe with the lock wrapper - tokenizer_wrappers = {} # name -> original_tokenizer - try: - # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) - for name in dir(local_pipe): - if "tokenizer" in name and not name.startswith("_"): - tok = getattr(local_pipe, name, None) - if tok is not None and self._is_tokenizer_component(tok): - tokenizer_wrappers[name] = tok - setattr( - local_pipe, - name, - lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) - ) - - # b) wrap tokenizers in components dict - CRITICAL FIX - if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): - for key, val in local_pipe.components.items(): - if val is None: - continue - - # Only wrap it if it really IS a tokenizer - if self._is_tokenizer_component(val): - tokenizer_wrappers[f"components[{key}]"] = val - # Create a new lambda that correctly captures 'val' - local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( - tokenizer, *args, lock=self._tokenizer_lock, **kwargs - ) - - except Exception as e: - logger.debug(f"Tokenizer wrapping step encountered an error: {e}") - - result = None - cm = getattr(local_pipe, "model_cpu_offload_context", None) - try: - if callable(cm): - try: - with cm(): - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except TypeError: - # cm might be a context manager instance rather than callable - try: - with cm: - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except Exception as e: - logger.debug(f"model_cpu_offload_context usage failed: {e}. 
Proceeding without it.") - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - else: - # no offload context available — call directly - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - - return result - - finally: - try: - # Restaurar los tokenizadores originales - for name, tok in tokenizer_wrappers.items(): - if name.startswith("components["): - key = name[len("components["):-1] - local_pipe.components[key] = tok - else: - setattr(local_pipe, name, tok) - except Exception as e: - logger.debug(f"Error restoring wrapped tokenizers: {e}") - class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index ebc87f30a7f3..8023b4e77dc8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union, Tuple -import copy +from typing import Any, Callable, Dict, List, Optional, Union import torch from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -101,16 +100,8 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -127,72 +118,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 0ee5ad4bc949..4c3975dca2a4 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -13,8 +13,7 @@ # limitations under the License. 
import inspect -from typing import Any, Callable, Dict, List, Optional, Union, Tuple -import copy +from typing import Any, Callable, Dict, List, Optional, Union import torch from transformers import ( CLIPTextModelWithProjection, @@ -95,16 +94,8 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -121,72 +112,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 81f1580fce4a..b97cf6f1f6f8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -120,18 +120,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -148,73 +140,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - import copy - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusionXLPipeline( diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 63f40497afff..1ce6987114a7 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -136,18 +136,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -164,72 +156,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 74a1a0bb1b22..2802d690f3cc 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -161,18 +161,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -189,72 +181,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusionXLAdapterPipeline( diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index ee767380e2f7..c4b336811cf4 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -7,7 +7,6 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from .scheduling_utils import SchedulerMixin -import copy def gumbel_noise(t, generator=None): @@ -162,7 +161,3 @@ def add_noise(self, sample, timesteps, generator=None): return masked_sample - def clone_for_request(self, num_inference_steps: int, temperature=(2, 0), device: Union[str, torch.device] = None): - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, temperature=temperature, device=device) - return local diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index 7bf3ec6f4aeb..acb24ea04d84 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -8,7 +8,6 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin -import copy # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -110,10 +109,7 @@ def set_timesteps( self.c_out = self.c_out.to(device) self.c_in = self.c_in.to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local + @property def init_noise_sigma(self): diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 271369777301..56145cebcf6f 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -243,11 +243,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index ecda598b8ce3..0752435240c3 100644 --- 
a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -241,11 +241,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # if a noise sampler is used, reinitialise it self.noise_sampler = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 9dc1006ee2a1..cd66070b69b6 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -339,11 +339,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py index 3e91077b7e50..efc04dd5023f 100644 --- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py @@ -302,11 +302,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index fba349c8fc9f..0ccf15828cee 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -286,11 +286,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 49107c9bca17..e61fe866a1ae 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -362,12 +362,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py 
index be6d7ad4880d..7cc0c4cef1f1 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -322,11 +322,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 571aaf52bccc..4d48b7c307fb 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -332,11 +332,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance def _get_variance(self, t, predicted_variance=None, variance_type=None): diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 126956204880..61143179329a 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -161,11 +161,6 @@ def set_timesteps( timesteps = torch.Tensor(timesteps).to(device) self.timesteps = timesteps - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 13adec66870c..e6581924e07d 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -318,11 +318,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py index 6de6d07f11c8..b6398399763c 100644 --- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py @@ -303,11 +303,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - 
local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 407215937fa6..d07ff8b2007b 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -457,12 +457,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index fd886b48eb22..06ff3c6c573a 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -330,11 +330,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 9bba69be9e49..9777a9ff54ee 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -412,11 +412,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 9d0bebe13d99..9cb72d021447 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -407,11 +407,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - 
return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index 105603e01f8d..bff9b267a058 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -273,11 +273,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index 20d3be9756dc..c5e3d8145b0e 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -261,11 +261,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 5713ffcfdee0..e9cb3107bbe9 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -318,11 +318,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index fee2d03e5291..513ef662820e 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -449,11 +449,6 @@ def set_timesteps( self._begin_index = None self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - 
local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def _sigma_to_t(self, sigma, log_sigmas): # get log sigma diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index 258e8252f557..da4b69957097 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -348,11 +348,6 @@ def set_timesteps( self._step_index = None self._begin_index = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index d31c6a9430cb..922a03a7fd34 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -123,10 +123,6 @@ def set_timesteps( self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) - def clone_for_request(self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None): - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, sampling_eps=sampling_eps, device=device) - return local def set_sigmas( self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 01a47bbd52a5..06063ddd3bfc 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -521,11 +521,6 @@ def set_timesteps( self._step_index = None self._begin_index = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 4b07949ac30f..b825102dfda9 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -177,11 +177,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index b0bc1d1a8b16..38354555e9f3 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -430,11 
+430,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 7ab4f151de65..5369901b7656 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -197,11 +197,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.log_cumprod_bt = self.log_cumprod_bt.to(device) self.log_cumprod_ct = self.log_cumprod_ct.to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, From a519915a226ae6c717b34c3d92542b9188dc0d77 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 14 Sep 2025 20:29:50 -0600 Subject: [PATCH 21/34] Update examples/server-async/utils/* --- .../DiffusersServer/serverasync.py | 2 +- .../{ => DiffusersServer}/utils/__init__.py | 0 .../utils/requestscopedpipeline.py | 43 ++++++++++++++----- .../{ => DiffusersServer}/utils/scheduler.py | 18 ++++---- 4 files changed, 44 insertions(+), 19 deletions(-) rename examples/server-async/{ => DiffusersServer}/utils/__init__.py (100%) rename examples/server-async/{ => DiffusersServer}/utils/requestscopedpipeline.py (86%) rename examples/server-async/{ => DiffusersServer}/utils/scheduler.py (95%) diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index 61eb99c3fdce..d345db595838 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -5,7 +5,7 @@ from pydantic import BaseModel from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD, logger import logging -from ..utils import RequestScopedPipeline +from .utils import RequestScopedPipeline from diffusers import * import random import uuid diff --git a/examples/server-async/utils/__init__.py b/examples/server-async/DiffusersServer/utils/__init__.py similarity index 100% rename from examples/server-async/utils/__init__.py rename to examples/server-async/DiffusersServer/utils/__init__.py diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/DiffusersServer/utils/requestscopedpipeline.py similarity index 86% rename from examples/server-async/utils/requestscopedpipeline.py rename to examples/server-async/DiffusersServer/utils/requestscopedpipeline.py index 56f5626ed156..79f79e28f5e7 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/DiffusersServer/utils/requestscopedpipeline.py @@ -3,6 +3,8 @@ import threading import torch from diffusers.utils import logging +from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps + logger = logging.get_logger(__name__) @@ -27,7 +29,8 @@ def __init__( mutable_attrs: Optional[Iterable[str]] = None, 
auto_detect_mutables: bool = True, tensor_numel_threshold: int = 1_000_000, - tokenizer_lock: Optional[threading.Lock] = None + tokenizer_lock: Optional[threading.Lock] = None, + wrap_scheduler: bool = True ): self._base = pipeline self.unet = getattr(pipeline, "unet", None) @@ -35,6 +38,10 @@ def __init__( self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) + if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: + if not isinstance(pipeline.scheduler, BaseAsyncScheduler): + pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() @@ -48,17 +55,24 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] if base_sched is None: return None - if hasattr(base_sched, "clone_for_request"): - try: - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) - except Exception as e: - logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + if not isinstance(base_sched, BaseAsyncScheduler): + wrapped_scheduler = BaseAsyncScheduler(base_sched) + else: + wrapped_scheduler = base_sched try: - return copy.deepcopy(base_sched) + return wrapped_scheduler.clone_for_request( + num_inference_steps=num_inference_steps, + device=device, + **clone_kwargs + ) except Exception as e: - logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).") - return base_sched + logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + try: + return copy.deepcopy(wrapped_scheduler) + except Exception as e: + logger.warning(f"Deepcopy of scheduler failed: {e}. 
Returning original scheduler (*risky*).") + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -197,7 +211,16 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = if local_scheduler is not None: try: - setattr(local_pipe, "scheduler", local_scheduler) + timesteps, num_steps, configured_scheduler = async_retrieve_timesteps( + local_scheduler.scheduler, + num_inference_steps=num_inference_steps, + device=device, + return_scheduler=True, + **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} + ) + + final_scheduler = BaseAsyncScheduler(configured_scheduler) + setattr(local_pipe, "scheduler", final_scheduler) except Exception: logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") diff --git a/examples/server-async/utils/scheduler.py b/examples/server-async/DiffusersServer/utils/scheduler.py similarity index 95% rename from examples/server-async/utils/scheduler.py rename to examples/server-async/DiffusersServer/utils/scheduler.py index a20715e254cd..848905985dd4 100644 --- a/examples/server-async/utils/scheduler.py +++ b/examples/server-async/DiffusersServer/utils/scheduler.py @@ -5,14 +5,16 @@ class BaseAsyncScheduler: def __init__(self, scheduler: Any): - pass - - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - # I leave it as an example of what the Scheduler should do to implement it later - """local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local""" - pass + self.scheduler = scheduler + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device, None] = None, **kwargs): + local = copy.deepcopy(self.scheduler) + + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, **kwargs) + + cloned = self.__class__(local) + + return cloned def async_retrieve_timesteps( From 7cfee776c9b6d71d60c5a95469e0e873a582c9d2 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 14 Sep 2025 20:48:40 -0600 Subject: [PATCH 22/34] Fix BaseAsyncScheduler --- .../DiffusersServer/utils/scheduler.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/examples/server-async/DiffusersServer/utils/scheduler.py b/examples/server-async/DiffusersServer/utils/scheduler.py index 848905985dd4..5925edfeab04 100644 --- a/examples/server-async/DiffusersServer/utils/scheduler.py +++ b/examples/server-async/DiffusersServer/utils/scheduler.py @@ -7,15 +7,32 @@ class BaseAsyncScheduler: def __init__(self, scheduler: Any): self.scheduler = scheduler + def __getattr__(self, name: str): + if hasattr(self.scheduler, name): + return getattr(self.scheduler, name) + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + def __setattr__(self, name: str, value): + if name == 'scheduler': + super().__setattr__(name, value) + else: + if hasattr(self, 'scheduler') and hasattr(self.scheduler, name): + setattr(self.scheduler, name, value) + else: + super().__setattr__(name, value) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device, None] = None, **kwargs): local = copy.deepcopy(self.scheduler) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, **kwargs) - cloned = self.__class__(local) - return cloned + def __repr__(self): + return f"BaseAsyncScheduler({repr(self.scheduler)})" + + def __str__(self): + return 
f"BaseAsyncScheduler wrapping: {str(self.scheduler)}" + def async_retrieve_timesteps( scheduler, From e574f07968ca2f2d47c69839378e37d7e8f09f61 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 15 Sep 2025 07:09:04 -0600 Subject: [PATCH 23/34] Rollback in the core of the diffusers --- src/diffusers/pipelines/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 86b4e22fb814..8ed07a72e3fd 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -522,7 +522,6 @@ DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin, - RequestScopedPipeline ) try: From 10496638912eceaf951d1d3718442489d0db70c0 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 15 Sep 2025 12:49:53 -0600 Subject: [PATCH 24/34] Update examples/server-async/README.md --- examples/server-async/README.md | 66 +++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index edf07852c247..59c8cd6eda62 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,11 +1,10 @@ # Asynchronous server and parallel execution of models > Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. -> We recommend running about 10 to 50 inferences in parallel to have a good performance of 25-30s to 1-1:30min on average +> We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.) ## ⚠️ IMPORTANT -* This example uses a custom Diffusers fork: `https://github.com/F4k3r22/diffusers-async`. * The server and inference harness live in this repo: `https://github.com/F4k3r22/DiffusersServer`. The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. @@ -15,7 +14,11 @@ All the components needed to create the inference server are in `DiffusersServer ``` DiffusersServer/ -├── **init**.py +├── utils/ +├─────── __init__.py +├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences +├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model +├── __init__.py ├── create_server.py # helper script to build/run the app programmatically ├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) ├── serverasync.py # FastAPI app factory (create\_app\_fastapi) @@ -29,10 +32,11 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r `diffusers-async` / this example addresses that by: * **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*. -* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. 
Where available we call `scheduler.clone_for_request(...)`, otherwise we fallback to safe `deepcopy` or other heuristics. -* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur. -* **`retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API. -* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. +* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. The system uses `BaseAsyncScheduler.clone_for_request(...)` for scheduler cloning, with fallback to safe `deepcopy` or other heuristics. +* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock with automatic tokenizer detection and wrapping. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur. +* **`async_retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API. +* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. Configurable tensor size threshold prevents cloning of large tensors. +* **Enhanced scheduler wrapping**: `BaseAsyncScheduler` automatically wraps schedulers with improved `__getattr__`, `__setattr__`, and debugging methods (`__repr__`, `__str__`). ## How the server works (high-level flow) @@ -41,10 +45,12 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r * The server uses `RequestScopedPipeline.generate(...)` which: + * automatically wraps the base scheduler in `BaseAsyncScheduler` (if not already wrapped), * obtains a *local scheduler* (via `clone_for_request()` or `deepcopy`), * does `local_pipe = copy.copy(base_pipe)` (shallow copy), * sets `local_pipe.scheduler = local_scheduler` (if possible), - * clones only small mutable attributes (callbacks, rng, small latents), + * clones only small mutable attributes (callbacks, rng, small latents) with auto-detection, + * wraps tokenizers with thread-safe locks to prevent race conditions, * optionally enters a `model_cpu_offload_context()` for memory offload hooks, * calls the pipeline on the local view (`local_pipe(...)`). 3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`). @@ -56,14 +62,10 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r Recommended: create a virtualenv / conda environment. 
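
If you want to exercise the request-scoped flow outside the full server first, a minimal sketch looks like this. It is illustrative only: the model id, the import path of `RequestScopedPipeline` (it lives under this example's `utils/` package) and the seeding strategy are assumptions, not part of the server code.

```python
# Minimal sketch: one shared pipeline in memory, per-request lightweight views.
import random
import threading

import torch
from diffusers import StableDiffusion3Pipeline

from utils import RequestScopedPipeline  # assumed import path for this example

base = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16
).to("cuda")

# Wrap once at startup; UNet/VAE/text encoders stay shared and are never duplicated.
request_pipe = RequestScopedPipeline(base)


def handle_request(prompt: str, steps: int = 28):
    # Each request gets its own generator; generate() shallow-copies the pipeline
    # and gives the copy a per-request scheduler clone before running inference.
    gen = torch.Generator(device="cuda").manual_seed(random.randint(0, 10_000_000))
    out = request_pipe.generate(
        prompt=prompt,
        num_inference_steps=steps,
        generator=gen,
        device="cuda",
        output_type="pil",
    )
    return out.images


# Two concurrent requests sharing the same weights:
threads = [threading.Thread(target=handle_request, args=(p,)) for p in ("a cat", "a dog")]
for t in threads:
    t.start()
for t in threads:
    t.join()
```
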
-If using the `diffusers` fork via git, either: - -**A) Preinstall the fork first:** - ```bash -pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" +pip install diffusers pip install -r requirements.txt -```` +``` ### 2) Start the server @@ -93,16 +95,42 @@ Response example: } ``` +## Advanced Configuration + +### RequestScopedPipeline Parameters + +```python +RequestScopedPipeline( + pipeline, # Base pipeline to wrap + mutable_attrs=None, # Custom list of attributes to clone + auto_detect_mutables=True, # Enable automatic detection of mutable attributes + tensor_numel_threshold=1_000_000, # Tensor size threshold for cloning + tokenizer_lock=None, # Custom threading lock for tokenizers + wrap_scheduler=True # Auto-wrap scheduler in BaseAsyncScheduler +) +``` + +### BaseAsyncScheduler Features + +* Transparent proxy to the original scheduler with `__getattr__` and `__setattr__` +* `clone_for_request()` method for safe per-request scheduler cloning +* Enhanced debugging with `__repr__` and `__str__` methods +* Full compatibility with existing scheduler APIs + ## Troubleshooting (quick) * `Already borrowed` — previously a Rust tokenizer concurrency error. - ✅ This is now fixed: `RequestScopedPipeline` manages an internal tokenizer lock so race conditions no longer happen. + ✅ This is now fixed: `RequestScopedPipeline` automatically detects and wraps tokenizers with thread locks, so race conditions no longer happen. * `can't set attribute 'components'` — pipeline exposes read-only `components`. - - * The RequestScopedPipeline now detects read-only properties and skips setting them. + ✅ The RequestScopedPipeline now detects read-only properties and skips setting them automatically. * Scheduler issues: + * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `async_retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. + ✅ Note: `async_retrieve_timesteps` is fully retro-compatible — if you don't pass `return_scheduler=True`, the behavior is unchanged. + +* Memory issues with large tensors: + ✅ The system now has configurable `tensor_numel_threshold` to prevent cloning of large tensors while still cloning small mutable ones. - * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. - * ✅ Note: `retrieve_timesteps` is fully retro-compatible — if you don’t pass `return_scheduler=True`, the behavior is unchanged. +* Automatic tokenizer detection: + ✅ The system automatically identifies tokenizer components by checking for tokenizer methods, class names, and attributes, then applies thread-safe wrappers. 
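
Related to the scheduler notes above, both per-request paths can be exercised in isolation. This is an illustrative sketch, not part of the server: the `utils.scheduler` import path is an assumption, and any diffusers scheduler implementing `set_timesteps` should behave the same way as the `FlowMatchEulerDiscreteScheduler` used here.

```python
# Sketch: configure per-request scheduler state without mutating the shared instance.
from diffusers import FlowMatchEulerDiscreteScheduler

from utils.scheduler import BaseAsyncScheduler, async_retrieve_timesteps  # assumed path

shared = BaseAsyncScheduler(FlowMatchEulerDiscreteScheduler())  # transparent proxy

# Option A: clone_for_request() deep-copies the wrapped scheduler and calls
# set_timesteps() on the copy, so the shared scheduler is never touched.
local = shared.clone_for_request(num_inference_steps=28, device="cpu")

# Option B: the retro-compatible helper; with return_scheduler=True it hands back a
# configured scheduler instead of mutating the one you passed in.
timesteps, num_steps, configured = async_retrieve_timesteps(
    shared.scheduler,
    num_inference_steps=28,
    device="cpu",
    return_scheduler=True,
)
print(num_steps, len(timesteps), type(configured).__name__)
```
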
\ No newline at end of file From 531662085d82911558ace92ff33e2236406f70ae Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 15 Sep 2025 13:01:28 -0600 Subject: [PATCH 25/34] Complete rollback of diffusers core files --- src/diffusers/pipelines/flux/pipeline_flux.py | 4 ++-- src/diffusers/pipelines/pipeline_utils.py | 2 -- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- .../stable_diffusion_3/pipeline_stable_diffusion_3.py | 1 + src/diffusers/schedulers/scheduling_amused.py | 1 - src/diffusers/schedulers/scheduling_consistency_decoder.py | 2 -- src/diffusers/schedulers/scheduling_consistency_models.py | 1 - .../schedulers/scheduling_cosine_dpmsolver_multistep.py | 1 - src/diffusers/schedulers/scheduling_ddim.py | 1 - src/diffusers/schedulers/scheduling_ddim_cogvideox.py | 1 - src/diffusers/schedulers/scheduling_ddim_inverse.py | 1 - src/diffusers/schedulers/scheduling_ddim_parallel.py | 1 - src/diffusers/schedulers/scheduling_ddpm.py | 1 - src/diffusers/schedulers/scheduling_ddpm_parallel.py | 1 - src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py | 1 - src/diffusers/schedulers/scheduling_deis_multistep.py | 2 -- src/diffusers/schedulers/scheduling_dpm_cogvideox.py | 1 - .../schedulers/scheduling_dpmsolver_multistep_inverse.py | 2 -- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 1 - src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 1 - .../schedulers/scheduling_edm_dpmsolver_multistep.py | 1 - src/diffusers/schedulers/scheduling_edm_euler.py | 1 - .../schedulers/scheduling_euler_ancestral_discrete.py | 1 - src/diffusers/schedulers/scheduling_euler_discrete.py | 1 - .../schedulers/scheduling_flow_match_euler_discrete.py | 1 - src/diffusers/schedulers/scheduling_sde_ve.py | 2 -- src/diffusers/schedulers/scheduling_tcd.py | 1 - src/diffusers/schedulers/scheduling_unclip.py | 1 - src/diffusers/schedulers/scheduling_unipc_multistep.py | 2 -- src/diffusers/schedulers/scheduling_vq_diffusion.py | 1 - 30 files changed, 4 insertions(+), 36 deletions(-) diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 42d20472bf0b..5041e352f73d 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -13,8 +13,8 @@ # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union, Tuple -import copy +from typing import Any, Callable, Dict, List, Optional, Union + import numpy as np import torch from transformers import ( diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d311b5b6df20..01b3c56777c8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -23,7 +23,6 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin - import numpy as np import PIL.Image import requests @@ -179,7 +178,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" Base class for all pipelines. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 8023b4e77dc8..cb97f18efeff 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect from typing import Any, Callable, Dict, List, Optional, Union + import torch from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -150,7 +151,6 @@ def retrieve_timesteps( return timesteps, num_inference_steps - class StableDiffusionPipeline( DiffusionPipeline, StableDiffusionMixin, diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 4c3975dca2a4..1618f89a49e3 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -14,6 +14,7 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union + import torch from transformers import ( CLIPTextModelWithProjection, diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index c4b336811cf4..238b8d869171 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -160,4 +160,3 @@ def add_noise(self, sample, timesteps, generator=None): masked_sample[mask_indices] = self.config.mask_token_id return masked_sample - diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index acb24ea04d84..d7af018b284a 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -109,8 +109,6 @@ def set_timesteps( self.c_out = self.c_out.to(device) self.c_in = self.c_in.to(device) - - @property def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 56145cebcf6f..0f5062258800 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -243,7 +243,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index 0752435240c3..66ed296da8ea 100644 --- a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -241,7 +241,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # if a noise sampler is used, reinitialise it self.noise_sampler = None - # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index cd66070b69b6..5ee0d084f060 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -339,7 +339,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py index efc04dd5023f..c19efdc7834d 100644 --- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py @@ -302,7 +302,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 0ccf15828cee..49dba840d089 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -286,7 +286,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index e61fe866a1ae..7c3f03a8dbe1 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -362,7 +362,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 7cc0c4cef1f1..0fab6d910a82 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -322,7 +322,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 4d48b7c307fb..ec741f9ecb7d 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -332,7 +332,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 61143179329a..71f08277ebd7 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -161,7 +161,6 @@ def set_timesteps( timesteps = torch.Tensor(timesteps).to(device) self.timesteps = timesteps - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index e6581924e07d..7d8685ba10c3 100644 --- 
a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -317,8 +317,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py index b6398399763c..f7b63720e107 100644 --- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py @@ -303,7 +303,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 06ff3c6c573a..9ec958851111 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -329,8 +329,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._step_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 9777a9ff54ee..eeb06773d977 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -412,7 +412,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None - def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 9cb72d021447..8663210a6244 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -407,7 +407,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index bff9b267a058..f1b38aaff56c 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -273,7 +273,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, 
sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index c5e3d8145b0e..dbeff3de5652 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -261,7 +261,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index e9cb3107bbe9..9cdaa2c5e101 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -318,7 +318,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 513ef662820e..f58d918dbfbe 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -449,7 +449,6 @@ def set_timesteps( self._begin_index = None self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication - def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(np.maximum(sigma, 1e-10)) diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index da4b69957097..1a4f12ddfa53 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -348,7 +348,6 @@ def set_timesteps( self._step_index = None self._begin_index = None - def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 922a03a7fd34..1bfc08cce5e9 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -24,7 +24,6 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin, SchedulerOutput -import copy @dataclass @@ -123,7 +122,6 @@ def set_timesteps( self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) - def set_sigmas( self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None ): diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 06063ddd3bfc..3fd5c341eca9 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -521,7 +521,6 @@ def 
set_timesteps( self._step_index = None self._begin_index = None - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index b825102dfda9..d78efabfbc57 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -177,7 +177,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) - def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: prev_timestep = t - 1 diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 38354555e9f3..162a34bd2774 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -429,8 +429,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 5369901b7656..57306301d023 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -197,7 +197,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.log_cumprod_bt = self.log_cumprod_bt.to(device) self.log_cumprod_ct = self.log_cumprod_ct.to(device) - def step( self, model_output: torch.Tensor, From 0ecdfc3ff5c3d4711b566332875ffe61cec2998b Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Tue, 16 Sep 2025 19:34:44 -0600 Subject: [PATCH 26/34] Simple implementation of an asynchronous server compatible with SD3-3.5 and Flux Pipelines --- .../server-async/DiffusersServer/__init__.py | 2 - .../DiffusersServer/create_server.py | 45 --- .../DiffusersServer/serverasync.py | 340 ------------------ .../DiffusersServer/utils/__init__.py | 1 - .../DiffusersServer/uvicorn_diffu.py | 66 ---- .../{DiffusersServer => }/Pipelines.py | 68 ++-- examples/server-async/README.md | 2 +- examples/server-async/server.py | 11 - examples/server-async/serverasync.py | 223 ++++++++++++ examples/server-async/test.py | 2 +- examples/server-async/utils/__init__.py | 2 + .../utils/requestscopedpipeline.py | 0 .../{DiffusersServer => }/utils/scheduler.py | 0 examples/server-async/utils/utils.py | 44 +++ 14 files changed, 315 insertions(+), 491 deletions(-) delete mode 100644 examples/server-async/DiffusersServer/__init__.py delete mode 100644 examples/server-async/DiffusersServer/create_server.py delete mode 100644 examples/server-async/DiffusersServer/serverasync.py delete mode 100644 examples/server-async/DiffusersServer/utils/__init__.py delete mode 100644 examples/server-async/DiffusersServer/uvicorn_diffu.py rename examples/server-async/{DiffusersServer => }/Pipelines.py (59%) delete mode 100644 examples/server-async/server.py create mode 100644 examples/server-async/serverasync.py create mode 100644 examples/server-async/utils/__init__.py rename examples/server-async/{DiffusersServer => }/utils/requestscopedpipeline.py (100%) rename examples/server-async/{DiffusersServer 
=> }/utils/scheduler.py (100%) create mode 100644 examples/server-async/utils/utils.py diff --git a/examples/server-async/DiffusersServer/__init__.py b/examples/server-async/DiffusersServer/__init__.py deleted file mode 100644 index 0d8d5761a939..000000000000 --- a/examples/server-async/DiffusersServer/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .Pipelines import TextToImagePipelineSD3 -from .create_server import create_inference_server_Async as DiffusersServerApp \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/create_server.py b/examples/server-async/DiffusersServer/create_server.py deleted file mode 100644 index 7ccfd9c742f8..000000000000 --- a/examples/server-async/DiffusersServer/create_server.py +++ /dev/null @@ -1,45 +0,0 @@ -# create_server.py - -from .Pipelines import * -from .serverasync import * -from .uvicorn_diffu import * -import asyncio - -def create_inference_server_Async( - model:str, - type_model: str = 't2im', - host: str = '0.0.0.0', - port: int = 8500, - threads=5, - enable_memory_monitor=True, - custom_model: bool = False, - custom_pipeline: Optional[Type] | None = None, - constructor_pipeline: Optional[Type] | None = None, - components: Optional[Dict[str, Any]] = None, - api_name: Optional[str] = 'custom_api', - torch_dtype = torch.bfloat16 -): - config = ServerConfigModels( - model=model, - type_models=type_model, - custom_model=custom_model, - custom_pipeline=custom_pipeline, - constructor_pipeline=constructor_pipeline, - components=components, - api_name=api_name, - torch_dtype=torch_dtype, - host=host, - port=port - ) - - app = create_app_fastapi(config) - - asyncio.run(run_uvicorn_server( - app, - host=host, - port=port, - workers=threads, - enable_memory_monitor=enable_memory_monitor - )) - - return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py deleted file mode 100644 index d345db595838..000000000000 --- a/examples/server-async/DiffusersServer/serverasync.py +++ /dev/null @@ -1,340 +0,0 @@ -from fastapi import FastAPI, HTTPException, Request -from fastapi.responses import FileResponse -from fastapi.middleware.cors import CORSMiddleware -from fastapi.concurrency import run_in_threadpool -from pydantic import BaseModel -from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD, logger -import logging -from .utils import RequestScopedPipeline -from diffusers import * -import random -import uuid -import tempfile -from dataclasses import dataclass -import os -import torch -import threading -import gc -from typing import Optional, Dict, Any, Type -from dataclasses import dataclass, field -from typing import List -from contextlib import asynccontextmanager -import asyncio - -@dataclass -class PresetModels: - SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) - SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) - Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-schnell']) - -class ModelPipelineInitializer: - def __init__(self, model: str = '', type_models: str = 't2im'): - self.model = model - self.type_models = type_models - self.pipeline = None - self.device = "cuda" if torch.cuda.is_available() else "mps" - self.model_type = None - - def initialize_pipeline(self): - 
if not self.model: - raise ValueError("Model name not provided") - - # Check if model exists in PresetModels - preset_models = PresetModels() - - # Determine which model type we're dealing with - if self.model in preset_models.SD3: - self.model_type = "SD3" - elif self.model in preset_models.SD3_5: - self.model_type = "SD3_5" - elif self.model in preset_models.Flux: - self.model_type = "Flux" - else: - self.model_type = "SD" - - # Create appropriate pipeline based on model type and type_models - if self.type_models == 't2im': - if self.model_type in ["SD3", "SD3_5"]: - self.pipeline = TextToImagePipelineSD3(self.model) - elif self.model_type == "Flux": - self.pipeline = TextToImagePipelineFlux(self.model) - elif self.model_type == "SD": - self.pipeline = TextToImagePipelineSD(self.model) - else: - raise ValueError(f"Model type {self.model_type} not supported for text-to-image") - elif self.type_models == 't2v': - raise ValueError(f"Unsupported type_models: {self.type_models}") - - return self.pipeline - -class Utils: - def __init__(self, host: str = '0.0.0.0', port: int = 8500): - self.service_url = f"http://{host}:{port}" - self.image_dir = os.path.join(tempfile.gettempdir(), "images") - if not os.path.exists(self.image_dir): - os.makedirs(self.image_dir) - - self.video_dir = os.path.join(tempfile.gettempdir(), "videos") - if not os.path.exists(self.video_dir): - os.makedirs(self.video_dir) - - def save_image(self, image): - if hasattr(image, "to"): - try: - image = image.to("cpu") - except Exception: - pass - - if isinstance(image, torch.Tensor): - from torchvision import transforms - to_pil = transforms.ToPILImage() - image = to_pil(image.squeeze(0).clamp(0, 1)) - - filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" - image_path = os.path.join(self.image_dir, filename) - logger.info(f"Saving image to {image_path}") - - image.save(image_path, format="PNG", optimize=True) - - del image - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - return os.path.join(self.service_url, "images", filename) - -@dataclass -class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' - type_models: str = 't2im' - custom_model : bool = False - constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None - components: Optional[Dict[str, Any]] = None - api_name: Optional[str] = 'custom_api' - torch_dtype: Optional[torch.dtype] = None - host: str = '0.0.0.0' - port: int = 8500 - -def create_app_fastapi(config: ServerConfigModels) -> FastAPI: - - server_config = config or ServerConfigModels() - - @asynccontextmanager - async def lifespan(app: FastAPI): - logging.basicConfig(level=logging.INFO) - app.state.logger = logging.getLogger("diffusers-server") - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' - os.environ['CUDA_LAUNCH_BLOCKING'] = '0' - - app.state.total_requests = 0 - app.state.active_inferences = 0 - app.state.metrics_lock = asyncio.Lock() - app.state.metrics_task = None - - app.state.utils_app = Utils( - host=server_config.host, - port=server_config.port, - ) - - async def metrics_loop(): - try: - while True: - async with app.state.metrics_lock: - total = app.state.total_requests - active = app.state.active_inferences - app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}") - await asyncio.sleep(5) - except asyncio.CancelledError: - app.state.logger.info("Metrics loop cancelled") - raise - - app.state.metrics_task = 
asyncio.create_task(metrics_loop()) - - try: - yield - finally: - task = app.state.metrics_task - if task: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - try: - stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) - if callable(stop_fn): - await run_in_threadpool(stop_fn) - except Exception as e: - app.state.logger.warning(f"Error during pipeline shutdown: {e}") - - app.state.logger.info("Lifespan shutdown complete") - - app = FastAPI(lifespan=lifespan) - - logger = logging.getLogger("DiffusersServer.Pipelines") - - if server_config.custom_model: - if server_config.constructor_pipeline is None: - raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required") - - initializer = server_config.constructor_pipeline( - model_path=server_config.model, - pipeline=server_config.custom_pipeline, - torch_dtype=server_config.torch_dtype, - components=server_config.components, - ) - model_pipeline = initializer.start() - request_pipe = None - pipeline_lock = threading.Lock() - - else: - initializer = ModelPipelineInitializer( - model=server_config.model, - type_models=server_config.type_models, - ) - model_pipeline = initializer.initialize_pipeline() - model_pipeline.start() - - request_pipe = RequestScopedPipeline(model_pipeline.pipeline) - pipeline_lock = threading.Lock() - - logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") - - app.state.MODEL_INITIALIZER = initializer - app.state.MODEL_PIPELINE = model_pipeline - app.state.REQUEST_PIPE = request_pipe - app.state.PIPELINE_LOCK = pipeline_lock - - class JSONBodyQueryAPI(BaseModel): - model : str | None = None - prompt : str - negative_prompt : str | None = None - num_inference_steps : int = 28 - num_images_per_prompt : int = 1 - - @app.middleware("http") - async def count_requests_middleware(request: Request, call_next): - async with app.state.metrics_lock: - app.state.total_requests += 1 - response = await call_next(request) - return response - - - @app.get("/") - async def root(): - return {"message": "Welcome to the Diffusers Server"} - - @app.post("/api/diffusers/inference") - async def api(json: JSONBodyQueryAPI): - prompt = json.prompt - negative_prompt = json.negative_prompt or "" - num_steps = json.num_inference_steps - num_images_per_prompt = json.num_images_per_prompt - - wrapper = app.state.MODEL_PIPELINE - initializer = app.state.MODEL_INITIALIZER - - utils_app = app.state.utils_app - - - if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Model not initialized correctly") - if not prompt.strip(): - raise HTTPException(400, "No prompt provided") - - - def make_generator(): - g = torch.Generator(device=initializer.device) - return g.manual_seed(random.randint(0, 10_000_000)) - - req_pipe = app.state.REQUEST_PIPE - - def infer(): - gen = make_generator() - return req_pipe.generate( - prompt=prompt, - negative_prompt=negative_prompt, - generator=gen, - num_inference_steps=num_steps, - num_images_per_prompt=num_images_per_prompt, - device=initializer.device, - output_type="pil", - ) - - try: - async with app.state.metrics_lock: - app.state.active_inferences += 1 - - output = await run_in_threadpool(infer) - - async with app.state.metrics_lock: - app.state.active_inferences = max(0, app.state.active_inferences - 1) - - urls = [utils_app.save_image(img) for img in output.images] - return {"response": urls} - - except Exception as e: - async with app.state.metrics_lock: - 
app.state.active_inferences = max(0, app.state.active_inferences - 1) - logger.error(f"Error during inference: {e}") - raise HTTPException(500, f"Error in processing: {e}") - - finally: - if torch.cuda.is_available(): - torch.cuda.synchronize() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - torch.cuda.ipc_collect() - gc.collect() - - - @app.get("/images/{filename}") - async def serve_image(filename: str): - utils_app = app.state.utils_app - file_path = os.path.join(utils_app.image_dir, filename) - if not os.path.isfile(file_path): - raise HTTPException(status_code=404, detail="Image not found") - return FileResponse(file_path, media_type="image/png") - - @app.get("/api/models") - async def list_models(): - return { - "current_model" : server_config.model, - "type" : server_config.type_models, - "all_models": { - "type": "T2Img", - "SD3": PresetModels().SD3, - "SD3_5": PresetModels().SD3_5, - "Flux": PresetModels().Flux, - } - } - - @app.get("/api/status") - async def get_status(): - memory_info = {} - if torch.cuda.is_available(): - memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB - memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB - memory_info = { - "memory_allocated_gb": round(memory_allocated, 2), - "memory_reserved_gb": round(memory_reserved, 2), - "device": torch.cuda.get_device_name(0) - } - - return { - "current_model" : server_config.model, - "type_models" : server_config.type_models, - "memory" : memory_info} - - - app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/utils/__init__.py b/examples/server-async/DiffusersServer/utils/__init__.py deleted file mode 100644 index 38b01f7aa59d..000000000000 --- a/examples/server-async/DiffusersServer/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .requestscopedpipeline import RequestScopedPipeline \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py deleted file mode 100644 index c2688e25497d..000000000000 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ /dev/null @@ -1,66 +0,0 @@ -import uvicorn -import logging -import gc -import psutil -import os -import threading -import time - -def setup_logging(): - logging.basicConfig(level=logging.INFO) - return logging.getLogger('uvicorn') - -logger = setup_logging() - -def memory_cleanup(interval=30): - while True: - try: - gc.collect() - - process = psutil.Process(os.getpid()) - mem = process.memory_info().rss / 1024 / 1024 - logger.info(f"Memory in use: {mem:.2f} MB") - - time.sleep(interval) - except Exception as e: - logger.error(f"Memory clearing error: {str(e)}") - time.sleep(interval) - -def run_uvicorn_server( - app, - host='0.0.0.0', - port=8500, - workers=5, - cleanup_interval=30, - channel_timeout=900, - headers=[ - ("server", "DiffusersServer") - ], - enable_memory_monitor=True -): - gc.enable() - gc.set_threshold(700, 10, 5) - - if enable_memory_monitor: - cleanup_thread = threading.Thread( - target=memory_cleanup, - args=(cleanup_interval,), - daemon=True - ) - cleanup_thread.start() - logger.info("Memory monitor activated") - - logger.info(f"Starting Uvicorn server in {host}:{port}...") - - config = uvicorn.Config( - app=app, - host=host, - workers=workers, - port=port, - timeout_keep_alive=channel_timeout, - headers=headers - ) - - server = 
uvicorn.Server(config) - - return server.serve() \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/Pipelines.py similarity index 59% rename from examples/server-async/DiffusersServer/Pipelines.py rename to examples/server-async/Pipelines.py index bc60d4811c3e..dcf5f6eed596 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,11 +1,12 @@ # Pipelines.py from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from diffusers.pipelines.flux.pipeline_flux import FluxPipeline -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline import torch import os import logging from pydantic import BaseModel +from dataclasses import dataclass, field +from typing import List logger = logging.getLogger(__name__) @@ -15,6 +16,13 @@ class TextToImageInput(BaseModel): size: str | None = None n: int | None = None + +@dataclass +class PresetModels: + SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) + SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) + Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-schnell']) + class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): self.model_path = model_path or os.getenv("MODEL_PATH") @@ -72,28 +80,40 @@ def start(self): else: raise Exception("No CUDA or MPS device available") -class TextToImagePipelineSD: - def __init__(self, model_path: str | None = None): - self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: StableDiffusionPipeline | None = None - self.device: str | None = None +class ModelPipelineInitializer: + def __init__(self, model: str = '', type_models: str = 't2im'): + self.model = model + self.type_models = type_models + self.pipeline = None + self.device = "cuda" if torch.cuda.is_available() else "mps" + self.model_type = None - def start(self): - if torch.cuda.is_available(): - model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" - logger.info("Loading CUDA") - self.device = "cuda" - self.pipeline = StableDiffusionPipeline.from_pretrained( - model_path, - torch_dtype=torch.float16, - ).to(device=self.device) - elif torch.backends.mps.is_available(): - model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" - logger.info("Loading MPS for Mac M Series") - self.device = "mps" - self.pipeline = StableDiffusionPipeline.from_pretrained( - model_path, - torch_dtype=torch.float16, - ).to(device=self.device) + def initialize_pipeline(self): + if not self.model: + raise ValueError("Model name not provided") + + # Check if model exists in PresetModels + preset_models = PresetModels() + + # Determine which model type we're dealing with + if self.model in preset_models.SD3: + self.model_type = "SD3" + elif self.model in preset_models.SD3_5: + self.model_type = "SD3_5" + elif self.model in preset_models.Flux: + self.model_type = "Flux" else: - raise Exception("No CUDA or MPS device available") + self.model_type = "SD" + + # Create appropriate pipeline based on model type and type_models + if self.type_models == 't2im': + if self.model_type in ["SD3", "SD3_5"]: + self.pipeline = TextToImagePipelineSD3(self.model) + elif self.model_type == "Flux": + self.pipeline = 
TextToImagePipelineFlux(self.model) + else: + raise ValueError(f"Model type {self.model_type} not supported for text-to-image") + elif self.type_models == 't2v': + raise ValueError(f"Unsupported type_models: {self.type_models}") + + return self.pipeline \ No newline at end of file diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 59c8cd6eda62..ce5b01724729 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,6 +1,6 @@ # Asynchronous server and parallel execution of models -> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. +> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines. > We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.) ## ⚠️ IMPORTANT diff --git a/examples/server-async/server.py b/examples/server-async/server.py deleted file mode 100644 index 590522038a53..000000000000 --- a/examples/server-async/server.py +++ /dev/null @@ -1,11 +0,0 @@ -# DiffusersServerApp already handles the inference server and everything else internally, you -# just need to do these basic configurations and run the script with "python server.py" -# and you already get access to the inference APIs. 
-from DiffusersServer import DiffusersServerApp - -app = DiffusersServerApp( - model='stabilityai/stable-diffusion-3.5-medium', - type_model='t2im', - threads=3, - enable_memory_monitor=True -) \ No newline at end of file diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py new file mode 100644 index 000000000000..0cf1724e70ac --- /dev/null +++ b/examples/server-async/serverasync.py @@ -0,0 +1,223 @@ +from fastapi import FastAPI, HTTPException, Request +from fastapi.responses import FileResponse +from fastapi.middleware.cors import CORSMiddleware +from fastapi.concurrency import run_in_threadpool +from pydantic import BaseModel +from .Pipelines import ModelPipelineInitializer +from .utils import Utils, RequestScopedPipeline +import logging +from diffusers import * +import random +from dataclasses import dataclass +import os +import torch +import threading +import gc +from typing import Optional, Dict, Any, Type +from contextlib import asynccontextmanager +import asyncio + + +@dataclass +class ServerConfigModels: + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' + constructor_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None + components: Optional[Dict[str, Any]] = None + torch_dtype: Optional[torch.dtype] = None + host: str = '0.0.0.0' + port: int = 8500 + +server_config = ServerConfigModels() + +@asynccontextmanager +async def lifespan(app: FastAPI): + logging.basicConfig(level=logging.INFO) + app.state.logger = logging.getLogger("diffusers-server") + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' + os.environ['CUDA_LAUNCH_BLOCKING'] = '0' + + app.state.total_requests = 0 + app.state.active_inferences = 0 + app.state.metrics_lock = asyncio.Lock() + app.state.metrics_task = None + + app.state.utils_app = Utils( + host=server_config.host, + port=server_config.port, + ) + + async def metrics_loop(): + try: + while True: + async with app.state.metrics_lock: + total = app.state.total_requests + active = app.state.active_inferences + app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}") + await asyncio.sleep(5) + except asyncio.CancelledError: + app.state.logger.info("Metrics loop cancelled") + raise + + app.state.metrics_task = asyncio.create_task(metrics_loop()) + + try: + yield + finally: + task = app.state.metrics_task + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + try: + stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) + if callable(stop_fn): + await run_in_threadpool(stop_fn) + except Exception as e: + app.state.logger.warning(f"Error during pipeline shutdown: {e}") + + app.state.logger.info("Lifespan shutdown complete") + +app = FastAPI(lifespan=lifespan) + +logger = logging.getLogger("DiffusersServer.Pipelines") + + + +initializer = ModelPipelineInitializer( + model=server_config.model, + type_models=server_config.type_models, +) +model_pipeline = initializer.initialize_pipeline() +model_pipeline.start() + +request_pipe = RequestScopedPipeline(model_pipeline.pipeline) +pipeline_lock = threading.Lock() + +logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") + +app.state.MODEL_INITIALIZER = initializer +app.state.MODEL_PIPELINE = model_pipeline +app.state.REQUEST_PIPE = request_pipe +app.state.PIPELINE_LOCK = pipeline_lock + +class JSONBodyQueryAPI(BaseModel): + model : str | None = None + prompt : 
str + negative_prompt : str | None = None + num_inference_steps : int = 28 + num_images_per_prompt : int = 1 + +@app.middleware("http") +async def count_requests_middleware(request: Request, call_next): + async with app.state.metrics_lock: + app.state.total_requests += 1 + response = await call_next(request) + return response + + +@app.get("/") +async def root(): + return {"message": "Welcome to the Diffusers Server"} + +@app.post("/api/diffusers/inference") +async def api(json: JSONBodyQueryAPI): + prompt = json.prompt + negative_prompt = json.negative_prompt or "" + num_steps = json.num_inference_steps + num_images_per_prompt = json.num_images_per_prompt + + wrapper = app.state.MODEL_PIPELINE + initializer = app.state.MODEL_INITIALIZER + + utils_app = app.state.utils_app + + + if not wrapper or not wrapper.pipeline: + raise HTTPException(500, "Model not initialized correctly") + if not prompt.strip(): + raise HTTPException(400, "No prompt provided") + + + def make_generator(): + g = torch.Generator(device=initializer.device) + return g.manual_seed(random.randint(0, 10_000_000)) + + req_pipe = app.state.REQUEST_PIPE + + def infer(): + gen = make_generator() + return req_pipe.generate( + prompt=prompt, + negative_prompt=negative_prompt, + generator=gen, + num_inference_steps=num_steps, + num_images_per_prompt=num_images_per_prompt, + device=initializer.device, + output_type="pil", + ) + + try: + async with app.state.metrics_lock: + app.state.active_inferences += 1 + + output = await run_in_threadpool(infer) + + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + + urls = [utils_app.save_image(img) for img in output.images] + return {"response": urls} + + except Exception as e: + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + logger.error(f"Error during inference: {e}") + raise HTTPException(500, f"Error in processing: {e}") + + finally: + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.ipc_collect() + gc.collect() + + +@app.get("/images/{filename}") +async def serve_image(filename: str): + utils_app = app.state.utils_app + file_path = os.path.join(utils_app.image_dir, filename) + if not os.path.isfile(file_path): + raise HTTPException(status_code=404, detail="Image not found") + return FileResponse(file_path, media_type="image/png") + +@app.get("/api/status") +async def get_status(): + memory_info = {} + if torch.cuda.is_available(): + memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB + memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB + memory_info = { + "memory_allocated_gb": round(memory_allocated, 2), + "memory_reserved_gb": round(memory_reserved, 2), + "device": torch.cuda.get_device_name(0) + } + + return { + "current_model" : server_config.model, + "type_models" : server_config.type_models, + "memory" : memory_info} + + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) diff --git a/examples/server-async/test.py b/examples/server-async/test.py index 2a68c77bb28f..2c27146d0bd0 100644 --- a/examples/server-async/test.py +++ b/examples/server-async/test.py @@ -5,7 +5,7 @@ SERVER_URL = "http://localhost:8500/api/diffusers/inference" BASE_URL = "http://localhost:8500" -DOWNLOAD_FOLDER = "imagenes_generadas" +DOWNLOAD_FOLDER = "generated_images" WAIT_BEFORE_DOWNLOAD = 2 # 
seconds os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) diff --git a/examples/server-async/utils/__init__.py b/examples/server-async/utils/__init__.py new file mode 100644 index 000000000000..741cd9bb0219 --- /dev/null +++ b/examples/server-async/utils/__init__.py @@ -0,0 +1,2 @@ +from .requestscopedpipeline import RequestScopedPipeline +from .utils import Utils \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py similarity index 100% rename from examples/server-async/DiffusersServer/utils/requestscopedpipeline.py rename to examples/server-async/utils/requestscopedpipeline.py diff --git a/examples/server-async/DiffusersServer/utils/scheduler.py b/examples/server-async/utils/scheduler.py similarity index 100% rename from examples/server-async/DiffusersServer/utils/scheduler.py rename to examples/server-async/utils/scheduler.py diff --git a/examples/server-async/utils/utils.py b/examples/server-async/utils/utils.py new file mode 100644 index 000000000000..e3dbb45677e1 --- /dev/null +++ b/examples/server-async/utils/utils.py @@ -0,0 +1,44 @@ +import os +import tempfile +import torch +import uuid +import gc +import logging + +logger = logging.getLogger(__name__) + +class Utils: + def __init__(self, host: str = '0.0.0.0', port: int = 8500): + self.service_url = f"http://{host}:{port}" + self.image_dir = os.path.join(tempfile.gettempdir(), "images") + if not os.path.exists(self.image_dir): + os.makedirs(self.image_dir) + + self.video_dir = os.path.join(tempfile.gettempdir(), "videos") + if not os.path.exists(self.video_dir): + os.makedirs(self.video_dir) + + def save_image(self, image): + if hasattr(image, "to"): + try: + image = image.to("cpu") + except Exception: + pass + + if isinstance(image, torch.Tensor): + from torchvision import transforms + to_pil = transforms.ToPILImage() + image = to_pil(image.squeeze(0).clamp(0, 1)) + + filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" + image_path = os.path.join(self.image_dir, filename) + logger.info(f"Saving image to {image_path}") + + image.save(image_path, format="PNG", optimize=True) + + del image + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return os.path.join(self.service_url, "images", filename) \ No newline at end of file From ac5c9e6d3a6c3014741a974ad48550234896df86 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Tue, 16 Sep 2025 19:48:26 -0600 Subject: [PATCH 27/34] Update examples/server-async/README.md --- examples/server-async/Pipelines.py | 3 -- examples/server-async/README.md | 63 +++++++++++++++++++++------- examples/server-async/serverasync.py | 5 +++ 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/examples/server-async/Pipelines.py b/examples/server-async/Pipelines.py index dcf5f6eed596..d0012251da5d 100644 --- a/examples/server-async/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,4 +1,3 @@ -# Pipelines.py from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from diffusers.pipelines.flux.pipeline_flux import FluxPipeline import torch @@ -102,8 +101,6 @@ def initialize_pipeline(self): self.model_type = "SD3_5" elif self.model in preset_models.Flux: self.model_type = "Flux" - else: - self.model_type = "SD" # Create appropriate pipeline based on model type and type_models if self.type_models == 't2im': diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 
ce5b01724729..6842d59486e9 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -5,24 +5,24 @@ ## ⚠️ IMPORTANT -* The server and inference harness live in this repo: `https://github.com/F4k3r22/DiffusersServer`. - The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. +* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. ## Necessary components -All the components needed to create the inference server are in `DiffusersServer/` +All the components needed to create the inference server are in the current directory: ``` -DiffusersServer/ +server-async/ ├── utils/ ├─────── __init__.py -├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences -├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model -├── __init__.py -├── create_server.py # helper script to build/run the app programmatically -├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) -├── serverasync.py # FastAPI app factory (create\_app\_fastapi) -├── uvicorn_diffu.py # convenience script to start uvicorn with recommended flags +├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences +├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model +├─────── utils.py # Image/video saving utilities and service configuration +├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) +├── serverasync.py # FastAPI app with lifespan management and async inference endpoints +├── test.py # Client test script for inference requests +├── requirements.txt # Dependencies +└── README.md # This documentation ``` ## What `diffusers-async` adds / Why we needed it @@ -69,13 +69,28 @@ pip install -r requirements.txt ### 2) Start the server -Using the `server.py` file that already has everything you need: +Using the `serverasync.py` file that already has everything you need: ```bash -python server.py +python serverasync.py ``` -### 3) Example request +The server will start on `http://localhost:8500` by default with the following features: +- FastAPI application with async lifespan management +- Automatic model loading and pipeline initialization +- Request counting and active inference tracking +- Memory cleanup after each inference +- CORS middleware for cross-origin requests + +### 3) Test the server + +Use the included test script: + +```bash +python test.py +``` + +Or send a manual request: `POST /api/diffusers/inference` with JSON body: @@ -95,6 +110,13 @@ Response example: } ``` +### 4) Server endpoints + +- `GET /` - Welcome message +- `POST /api/diffusers/inference` - Main inference endpoint +- `GET /images/{filename}` - Serve generated images +- `GET /api/status` - Server status and memory info + ## Advanced Configuration ### RequestScopedPipeline Parameters @@ -117,6 +139,19 @@ RequestScopedPipeline( * Enhanced debugging with `__repr__` and `__str__` methods * Full compatibility with existing scheduler APIs +### Server Configuration + +The server configuration can be modified in `serverasync.py` through the `ServerConfigModels` dataclass: + +```python +@dataclass +class ServerConfigModels: + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' + host: 
str = '0.0.0.0' + port: int = 8500 +``` + ## Troubleshooting (quick) * `Already borrowed` — previously a Rust tokenizer concurrency error. diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py index 0cf1724e70ac..1723eb119849 100644 --- a/examples/server-async/serverasync.py +++ b/examples/server-async/serverasync.py @@ -221,3 +221,8 @@ async def get_status(): allow_methods=["*"], allow_headers=["*"], ) + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host=server_config.host, port=server_config.port) \ No newline at end of file From 72e021564da2dc7b5f395e9d7e7d0c6c04522c68 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Tue, 16 Sep 2025 20:07:38 -0600 Subject: [PATCH 28/34] Fixed import errors in 'examples/server-async/serverasync.py' --- examples/server-async/README.md | 2 +- examples/server-async/serverasync.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 6842d59486e9..d3feb9a092ab 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -146,7 +146,7 @@ The server configuration can be modified in `serverasync.py` through the `Server ```python @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' + model: str = 'stabilityai/stable-diffusion-3.5-medium' type_models: str = 't2im' host: str = '0.0.0.0' port: int = 8500 diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py index 1723eb119849..4f114f93d63f 100644 --- a/examples/server-async/serverasync.py +++ b/examples/server-async/serverasync.py @@ -3,10 +3,9 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel -from .Pipelines import ModelPipelineInitializer -from .utils import Utils, RequestScopedPipeline +from Pipelines import ModelPipelineInitializer +from utils import Utils, RequestScopedPipeline import logging -from diffusers import * import random from dataclasses import dataclass import os @@ -20,7 +19,7 @@ @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' + model: str = 'stabilityai/stable-diffusion-3.5-medium' type_models: str = 't2im' constructor_pipeline: Optional[Type] = None custom_pipeline: Optional[Type] = None From edd550ba5dcaabafececbf049036203716b319ef Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 17 Sep 2025 10:34:58 -0600 Subject: [PATCH 29/34] Flux Pipeline Discard --- examples/server-async/Pipelines.py | 37 ------------------------------ 1 file changed, 37 deletions(-) diff --git a/examples/server-async/Pipelines.py b/examples/server-async/Pipelines.py index d0012251da5d..c30669d26e99 100644 --- a/examples/server-async/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,5 +1,4 @@ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline -from diffusers.pipelines.flux.pipeline_flux import FluxPipeline import torch import os import logging @@ -20,7 +19,6 @@ class TextToImageInput(BaseModel): class PresetModels: SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) - Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 
'black-forest-labs/FLUX.1-schnell']) class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): @@ -48,37 +46,6 @@ def start(self): else: raise Exception("No CUDA or MPS device available") -class TextToImagePipelineFlux: - def __init__(self, model_path: str | None = None, low_vram: bool = False): - self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: FluxPipeline | None = None - self.device: str | None = None - self.low_vram = low_vram - - def start(self): - if torch.cuda.is_available(): - model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" - logger.info("Loading CUDA") - self.device = "cuda" - self.pipeline = FluxPipeline.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - ).to(device=self.device) - if self.low_vram: - self.pipeline.enable_model_cpu_offload() - else: - pass - elif torch.backends.mps.is_available(): - model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" - logger.info("Loading MPS for Mac M Series") - self.device = "mps" - self.pipeline = FluxPipeline.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - ).to(device=self.device) - else: - raise Exception("No CUDA or MPS device available") - class ModelPipelineInitializer: def __init__(self, model: str = '', type_models: str = 't2im'): self.model = model @@ -99,15 +66,11 @@ def initialize_pipeline(self): self.model_type = "SD3" elif self.model in preset_models.SD3_5: self.model_type = "SD3_5" - elif self.model in preset_models.Flux: - self.model_type = "Flux" # Create appropriate pipeline based on model type and type_models if self.type_models == 't2im': if self.model_type in ["SD3", "SD3_5"]: self.pipeline = TextToImagePipelineSD3(self.model) - elif self.model_type == "Flux": - self.pipeline = TextToImagePipelineFlux(self.model) else: raise ValueError(f"Model type {self.model_type} not supported for text-to-image") elif self.type_models == 't2v': From 6b693673e460ed1ae7a9a9b38437a5df323709ce Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 17 Sep 2025 12:05:11 -0600 Subject: [PATCH 30/34] Update examples/server-async/README.md --- examples/server-async/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index d3feb9a092ab..a47ab7c7f224 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,11 +1,11 @@ # Asynchronous server and parallel execution of models -> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines. +> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3 pipelines. > We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.) 
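To make the pattern described above concrete, here is a minimal, self-contained sketch of the idea (shared heavy weights, per-request clone of only the small stateful scheduler). `SharedPipeline` and `DummyScheduler` are toy stand-ins for illustration only, not classes from this example:

```python
import copy
import threading


class DummyScheduler:
    """Tiny stand-in for a diffusers scheduler: small but stateful."""

    def __init__(self):
        self.timesteps = []

    def set_timesteps(self, n):
        self.timesteps = list(range(n, 0, -1))


class SharedPipeline:
    """Heavy weights live here once; per-request copies only swap the scheduler."""

    def __init__(self, weights, scheduler):
        self.weights = weights        # imagine several GB of tensors, shared by all requests
        self.scheduler = scheduler    # mutable per-run state, must not be shared

    def __call__(self, prompt, num_inference_steps):
        self.scheduler.set_timesteps(num_inference_steps)
        return f"{prompt!r} rendered in {len(self.scheduler.timesteps)} steps"


base = SharedPipeline(weights=object(), scheduler=DummyScheduler())


def handle_request(prompt, steps):
    local = copy.copy(base)                          # lightweight view: weights stay shared
    local.scheduler = copy.deepcopy(base.scheduler)  # only the small stateful part is cloned
    print(local(prompt, steps))


threads = [threading.Thread(target=handle_request, args=(f"prompt {i}", 20 + i)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

The real `RequestScopedPipeline` in this example does the same thing with extra care for tokenizers, hooks, and other mutable attributes.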
## ⚠️ IMPORTANT -* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. +* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` concurrently while keeping a single copy of the heavy model parameters on GPU. ## Necessary components @@ -18,7 +18,7 @@ server-async/ ├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences ├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model ├─────── utils.py # Image/video saving utilities and service configuration -├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) +├── Pipelines.py # pipeline loader classes (SD3) ├── serverasync.py # FastAPI app with lifespan management and async inference endpoints ├── test.py # Client test script for inference requests ├── requirements.txt # Dependencies From 7c4f88348a8d3536a4568398e8f1f81cabab1ddc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Sep 2025 04:07:16 +0000 Subject: [PATCH 31/34] Apply style fixes --- examples/server-async/Pipelines.py | 34 +++++--- examples/server-async/serverasync.py | 85 ++++++++++--------- examples/server-async/test.py | 7 +- examples/server-async/utils/__init__.py | 2 +- .../utils/requestscopedpipeline.py | 47 +++++----- examples/server-async/utils/scheduler.py | 20 +++-- examples/server-async/utils/utils.py | 14 +-- 7 files changed, 122 insertions(+), 87 deletions(-) diff --git a/examples/server-async/Pipelines.py b/examples/server-async/Pipelines.py index c30669d26e99..f89cac6a7e4b 100644 --- a/examples/server-async/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,13 +1,17 @@ -from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline -import torch -import os import logging -from pydantic import BaseModel -from dataclasses import dataclass, field +import os +from dataclasses import dataclass, field from typing import List +import torch +from pydantic import BaseModel + +from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline + + logger = logging.getLogger(__name__) + class TextToImageInput(BaseModel): model: str prompt: str @@ -17,8 +21,15 @@ class TextToImageInput(BaseModel): @dataclass class PresetModels: - SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) - SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) + SD3: List[str] = field(default_factory=lambda: ["stabilityai/stable-diffusion-3-medium"]) + SD3_5: List[str] = field( + default_factory=lambda: [ + "stabilityai/stable-diffusion-3.5-large", + "stabilityai/stable-diffusion-3.5-large-turbo", + "stabilityai/stable-diffusion-3.5-medium", + ] + ) + class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): @@ -46,8 +57,9 @@ def start(self): else: raise Exception("No CUDA or MPS device available") + class ModelPipelineInitializer: - def __init__(self, model: str = '', type_models: str = 't2im'): + def __init__(self, model: str = "", type_models: str = "t2im"): self.model = model self.type_models = type_models self.pipeline = None @@ -68,12 +80,12 @@ def initialize_pipeline(self): self.model_type = "SD3_5" # Create appropriate pipeline based on model type and type_models - if 
self.type_models == 't2im': + if self.type_models == "t2im": if self.model_type in ["SD3", "SD3_5"]: self.pipeline = TextToImagePipelineSD3(self.model) else: raise ValueError(f"Model type {self.model_type} not supported for text-to-image") - elif self.type_models == 't2v': + elif self.type_models == "t2v": raise ValueError(f"Unsupported type_models: {self.type_models}") - return self.pipeline \ No newline at end of file + return self.pipeline diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py index 4f114f93d63f..b279b36f9a84 100644 --- a/examples/server-async/serverasync.py +++ b/examples/server-async/serverasync.py @@ -1,41 +1,45 @@ -from fastapi import FastAPI, HTTPException, Request -from fastapi.responses import FileResponse -from fastapi.middleware.cors import CORSMiddleware -from fastapi.concurrency import run_in_threadpool -from pydantic import BaseModel -from Pipelines import ModelPipelineInitializer -from utils import Utils, RequestScopedPipeline +import asyncio +import gc import logging -import random -from dataclasses import dataclass import os -import torch +import random import threading -import gc -from typing import Optional, Dict, Any, Type from contextlib import asynccontextmanager -import asyncio +from dataclasses import dataclass +from typing import Any, Dict, Optional, Type + +import torch +from fastapi import FastAPI, HTTPException, Request +from fastapi.concurrency import run_in_threadpool +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse +from Pipelines import ModelPipelineInitializer +from pydantic import BaseModel + +from utils import RequestScopedPipeline, Utils @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3.5-medium' - type_models: str = 't2im' + model: str = "stabilityai/stable-diffusion-3.5-medium" + type_models: str = "t2im" constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None components: Optional[Dict[str, Any]] = None torch_dtype: Optional[torch.dtype] = None - host: str = '0.0.0.0' + host: str = "0.0.0.0" port: int = 8500 + server_config = ServerConfigModels() + @asynccontextmanager async def lifespan(app: FastAPI): logging.basicConfig(level=logging.INFO) app.state.logger = logging.getLogger("diffusers-server") - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' - os.environ['CUDA_LAUNCH_BLOCKING'] = '0' + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True" + os.environ["CUDA_LAUNCH_BLOCKING"] = "0" app.state.total_requests = 0 app.state.active_inferences = 0 @@ -81,12 +85,12 @@ async def metrics_loop(): app.state.logger.info("Lifespan shutdown complete") + app = FastAPI(lifespan=lifespan) logger = logging.getLogger("DiffusersServer.Pipelines") - initializer = ModelPipelineInitializer( model=server_config.model, type_models=server_config.type_models, @@ -104,12 +108,14 @@ async def metrics_loop(): app.state.REQUEST_PIPE = request_pipe app.state.PIPELINE_LOCK = pipeline_lock + class JSONBodyQueryAPI(BaseModel): - model : str | None = None - prompt : str - negative_prompt : str | None = None - num_inference_steps : int = 28 - num_images_per_prompt : int = 1 + model: str | None = None + prompt: str + negative_prompt: str | None = None + num_inference_steps: int = 28 + num_images_per_prompt: int = 1 + @app.middleware("http") async def count_requests_middleware(request: Request, call_next): @@ 
-123,25 +129,24 @@ async def count_requests_middleware(request: Request, call_next): async def root(): return {"message": "Welcome to the Diffusers Server"} + @app.post("/api/diffusers/inference") async def api(json: JSONBodyQueryAPI): - prompt = json.prompt - negative_prompt = json.negative_prompt or "" - num_steps = json.num_inference_steps + prompt = json.prompt + negative_prompt = json.negative_prompt or "" + num_steps = json.num_inference_steps num_images_per_prompt = json.num_images_per_prompt - wrapper = app.state.MODEL_PIPELINE + wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER utils_app = app.state.utils_app - if not wrapper or not wrapper.pipeline: raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): raise HTTPException(400, "No prompt provided") - def make_generator(): g = torch.Generator(device=initializer.device) return g.manual_seed(random.randint(0, 10_000_000)) @@ -168,7 +173,7 @@ def infer(): async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - + urls = [utils_app.save_image(img) for img in output.images] return {"response": urls} @@ -195,27 +200,25 @@ async def serve_image(filename: str): raise HTTPException(status_code=404, detail="Image not found") return FileResponse(file_path, media_type="image/png") + @app.get("/api/status") async def get_status(): memory_info = {} if torch.cuda.is_available(): memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB - memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB + memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB memory_info = { "memory_allocated_gb": round(memory_allocated, 2), "memory_reserved_gb": round(memory_reserved, 2), - "device": torch.cuda.get_device_name(0) + "device": torch.cuda.get_device_name(0), } - return { - "current_model" : server_config.model, - "type_models" : server_config.type_models, - "memory" : memory_info} - + return {"current_model": server_config.model, "type_models": server_config.type_models, "memory": memory_info} + app.add_middleware( CORSMiddleware, - allow_origins=["*"], + allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], @@ -224,4 +227,4 @@ async def get_status(): if __name__ == "__main__": import uvicorn - uvicorn.run(app, host=server_config.host, port=server_config.port) \ No newline at end of file + uvicorn.run(app, host=server_config.host, port=server_config.port) diff --git a/examples/server-async/test.py b/examples/server-async/test.py index 2c27146d0bd0..e67317ea8f6b 100644 --- a/examples/server-async/test.py +++ b/examples/server-async/test.py @@ -1,8 +1,10 @@ import os import time import urllib.parse + import requests + SERVER_URL = "http://localhost:8500/api/diffusers/inference" BASE_URL = "http://localhost:8500" DOWNLOAD_FOLDER = "generated_images" @@ -10,6 +12,7 @@ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) + def save_from_url(url: str) -> str: """Download the given URL (relative or absolute) and save it locally.""" if url.startswith("/"): @@ -24,11 +27,12 @@ def save_from_url(url: str) -> str: f.write(resp.content) return path + def main(): payload = { "prompt": "The T-800 Terminator Robot Returning From The Future, Anime Style", "num_inference_steps": 30, - "num_images_per_prompt": 1 + "num_images_per_prompt": 1, } print("Sending request...") @@ -56,5 +60,6 @@ def main(): except Exception as e: print(f"Error downloading {u}: {e}") + if __name__ == "__main__": main() diff --git 
a/examples/server-async/utils/__init__.py b/examples/server-async/utils/__init__.py index 741cd9bb0219..731cfe491ae5 100644 --- a/examples/server-async/utils/__init__.py +++ b/examples/server-async/utils/__init__.py @@ -1,2 +1,2 @@ from .requestscopedpipeline import RequestScopedPipeline -from .utils import Utils \ No newline at end of file +from .utils import Utils diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index 79f79e28f5e7..57d1e2567169 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -1,17 +1,22 @@ -from typing import Optional, Any, Iterable, List import copy import threading +from typing import Any, Iterable, List, Optional + import torch + from diffusers.utils import logging + from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps logger = logging.get_logger(__name__) + def safe_tokenize(tokenizer, *args, lock, **kwargs): with lock: return tokenizer(*args, **kwargs) + class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", @@ -30,7 +35,7 @@ def __init__( auto_detect_mutables: bool = True, tensor_numel_threshold: int = 1_000_000, tokenizer_lock: Optional[threading.Lock] = None, - wrap_scheduler: bool = True + wrap_scheduler: bool = True, ): self._base = pipeline self.unet = getattr(pipeline, "unet", None) @@ -38,7 +43,7 @@ def __init__( self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) - if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: + if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None: if not isinstance(pipeline.scheduler, BaseAsyncScheduler): pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) @@ -62,9 +67,7 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] try: return wrapped_scheduler.clone_for_request( - num_inference_steps=num_inference_steps, - device=device, - **clone_kwargs + num_inference_steps=num_inference_steps, device=device, **clone_kwargs ) except Exception as e: logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") @@ -72,7 +75,7 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] return copy.deepcopy(wrapped_scheduler) except Exception as e: logger.warning(f"Deepcopy of scheduler failed: {e}. 
Returning original scheduler (*risky*).") - return wrapped_scheduler + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -140,7 +143,9 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = {"components",} + EXCLUDE_ATTRS = { + "components", + } for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -188,16 +193,16 @@ def _clone_mutable_attrs(self, base, local): def _is_tokenizer_component(self, component) -> bool: if component is None: return False - - tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + + tokenizer_methods = ["encode", "decode", "tokenize", "__call__"] has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - + class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = 'tokenizer' in class_name - - tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + has_tokenizer_in_name = "tokenizer" in class_name + + tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"] has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): @@ -216,7 +221,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = num_inference_steps=num_inference_steps, device=device, return_scheduler=True, - **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} + **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]}, ) final_scheduler = BaseAsyncScheduler(configured_scheduler) @@ -238,7 +243,9 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = setattr( local_pipe, name, - lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + lambda *args, tok=tok, **kwargs: safe_tokenize( + tok, *args, lock=self._tokenizer_lock, **kwargs + ), ) # b) wrap tokenizers in components dict @@ -246,7 +253,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = for key, val in local_pipe.components.items(): if val is None: continue - + if self._is_tokenizer_component(val): tokenizer_wrappers[f"components[{key}]"] = val local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( @@ -281,9 +288,9 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = try: for name, tok in tokenizer_wrappers.items(): if name.startswith("components["): - key = name[len("components["):-1] + key = name[len("components[") : -1] local_pipe.components[key] = tok else: setattr(local_pipe, name, tok) except Exception as e: - logger.debug(f"Error restoring wrapped tokenizers: {e}") \ No newline at end of file + logger.debug(f"Error restoring wrapped tokenizers: {e}") diff --git a/examples/server-async/utils/scheduler.py b/examples/server-async/utils/scheduler.py index 5925edfeab04..86d47cac6154 100644 --- a/examples/server-async/utils/scheduler.py +++ b/examples/server-async/utils/scheduler.py @@ -1,7 +1,9 @@ -from typing import Any, Optional, Union, List -import torch import copy import inspect +from typing import Any, List, Optional, Union + +import torch + class BaseAsyncScheduler: def __init__(self, scheduler: Any): @@ -11,12 +13,12 @@ def __getattr__(self, name: str): if 
hasattr(self.scheduler, name): return getattr(self.scheduler, name) raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") - + def __setattr__(self, name: str, value): - if name == 'scheduler': + if name == "scheduler": super().__setattr__(name, value) else: - if hasattr(self, 'scheduler') and hasattr(self.scheduler, name): + if hasattr(self, "scheduler") and hasattr(self.scheduler, name): setattr(self.scheduler, name, value) else: super().__setattr__(name, value) @@ -29,7 +31,7 @@ def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.d def __repr__(self): return f"BaseAsyncScheduler({repr(self.scheduler)})" - + def __str__(self): return f"BaseAsyncScheduler wrapping: {str(self.scheduler)}" @@ -91,7 +93,9 @@ def async_retrieve_timesteps( if hasattr(scheduler, "clone_for_request"): try: # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + scheduler_in_use = scheduler.clone_for_request( + num_inference_steps=num_inference_steps or 0, device=device + ) except Exception: scheduler_in_use = copy.deepcopy(scheduler) else: @@ -134,4 +138,4 @@ def _accepts(param_name: str) -> bool: if return_scheduler: return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps \ No newline at end of file + return timesteps_out, num_inference_steps diff --git a/examples/server-async/utils/utils.py b/examples/server-async/utils/utils.py index e3dbb45677e1..9f943305126c 100644 --- a/examples/server-async/utils/utils.py +++ b/examples/server-async/utils/utils.py @@ -1,14 +1,17 @@ +import gc +import logging import os import tempfile -import torch import uuid -import gc -import logging + +import torch + logger = logging.getLogger(__name__) + class Utils: - def __init__(self, host: str = '0.0.0.0', port: int = 8500): + def __init__(self, host: str = "0.0.0.0", port: int = 8500): self.service_url = f"http://{host}:{port}" self.image_dir = os.path.join(tempfile.gettempdir(), "images") if not os.path.exists(self.image_dir): @@ -27,6 +30,7 @@ def save_image(self, image): if isinstance(image, torch.Tensor): from torchvision import transforms + to_pil = transforms.ToPILImage() image = to_pil(image.squeeze(0).clamp(0, 1)) @@ -41,4 +45,4 @@ def save_image(self, image): if torch.cuda.is_available(): torch.cuda.empty_cache() - return os.path.join(self.service_url, "images", filename) \ No newline at end of file + return os.path.join(self.service_url, "images", filename) From f2e9f0242db51de66e7692f5adcefa9b1b7c1435 Mon Sep 17 00:00:00 2001 From: Fredy Rivera Date: Mon, 20 Oct 2025 18:11:52 -0600 Subject: [PATCH 32/34] Add thread-safe wrappers for components in pipeline Refactor requestscopedpipeline.py to add thread-safe wrappers for tokenizer, VAE, and image processor. Introduce locking mechanisms to ensure thread safety during concurrent access. 
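The commit below adds lock-based proxy wrappers around the tokenizer, VAE, and image processor. As a quick orientation before the diff, this is the generic shape of that pattern, shown as an illustrative sketch rather than the exact classes added here (`LockedProxy` and `ToyTokenizer` are hypothetical names):

```python
import threading


class LockedProxy:
    """Forward attribute access to `target`; run the guarded methods under `lock`."""

    def __init__(self, target, lock, guarded=("encode", "decode", "tokenize")):
        self._target = target
        self._lock = lock
        self._guarded = set(guarded)

    def __getattr__(self, name):
        attr = getattr(self._target, name)
        if name in self._guarded and callable(attr):
            def locked(*args, **kwargs):
                with self._lock:  # one caller at a time through the guarded method
                    return attr(*args, **kwargs)
            return locked
        return attr


class ToyTokenizer:
    def encode(self, text):
        return [ord(c) for c in text]


tok = LockedProxy(ToyTokenizer(), threading.Lock())
print(tok.encode("hi"))  # [104, 105], executed while holding the lock
```

Note that special-method calls such as `obj(...)` are looked up on the type and bypass `__getattr__`, which is why the tokenizer wrapper introduced in this commit also defines `__call__` explicitly.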
--- .../utils/requestscopedpipeline.py | 242 +++++++++++++----- 1 file changed, 173 insertions(+), 69 deletions(-) diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index 57d1e2567169..0b0e73ca04e2 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -1,26 +1,92 @@ +from typing import Optional, Any, Iterable, List import copy import threading -from typing import Any, Iterable, List, Optional - import torch - from diffusers.utils import logging - from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps - logger = logging.get_logger(__name__) +class ThreadSafeTokenizerWrapper: + def __init__(self, tokenizer, lock): + self._tokenizer = tokenizer + self._lock = lock -def safe_tokenize(tokenizer, *args, lock, **kwargs): - with lock: - return tokenizer(*args, **kwargs) - + self._thread_safe_methods = { + '__call__', 'encode', 'decode', 'tokenize', + 'encode_plus', 'batch_encode_plus', 'batch_decode' + } + + def __getattr__(self, name): + attr = getattr(self._tokenizer, name) + + if name in self._thread_safe_methods and callable(attr): + def wrapped_method(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped_method + + return attr + + def __call__(self, *args, **kwargs): + with self._lock: + return self._tokenizer(*args, **kwargs) + + def __setattr__(self, name, value): + if name.startswith('_'): + super().__setattr__(name, value) + else: + setattr(self._tokenizer, name, value) + + def __dir__(self): + return dir(self._tokenizer) + + +class ThreadSafeVAEWrapper: + def __init__(self, vae, lock): + self._vae = vae + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._vae, name) + # métodos que queremos proteger + if name in {"decode", "encode", "forward"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._vae, name, value) + +class ThreadSafeImageProcessorWrapper: + def __init__(self, proc, lock): + self._proc = proc + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._proc, name) + if name in {"postprocess", "preprocess"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._proc, name, value) class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", - "_offload_device", + "_offload_device", "_progress_bar_config", "_progress_bar", "_rng_state", @@ -38,23 +104,43 @@ def __init__( wrap_scheduler: bool = True, ): self._base = pipeline + + self.unet = getattr(pipeline, "unet", None) - self.vae = getattr(pipeline, "vae", None) + self.vae = getattr(pipeline, "vae", None) self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) - - if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None: + + self.transformer = getattr(pipeline, "transformer", None) + + if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: if not isinstance(pipeline.scheduler, BaseAsyncScheduler): pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) 
self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) + + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() + self._vae_lock = threading.Lock() + self._image_lock = threading.Lock() + self._auto_detect_mutables = bool(auto_detect_mutables) self._tensor_numel_threshold = int(tensor_numel_threshold) - self._auto_detected_attrs: List[str] = [] + def _detect_kernel_pipeline(self, pipeline) -> bool: + kernel_indicators = [ + 'text_encoding_cache', + 'memory_manager', + 'enable_optimizations', + '_create_request_context', + 'get_optimization_stats' + ] + + return any(hasattr(pipeline, attr) for attr in kernel_indicators) + + def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): base_sched = getattr(self._base, "scheduler", None) if base_sched is None: @@ -67,15 +153,25 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] try: return wrapped_scheduler.clone_for_request( - num_inference_steps=num_inference_steps, device=device, **clone_kwargs + num_inference_steps=num_inference_steps, + device=device, + **clone_kwargs ) except Exception as e: - logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + logger.debug(f"clone_for_request failed: {e}; trying shallow copy fallback") try: - return copy.deepcopy(wrapped_scheduler) - except Exception as e: - logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).") - return wrapped_scheduler + if hasattr(wrapped_scheduler, 'scheduler'): + try: + copied_scheduler = copy.copy(wrapped_scheduler.scheduler) + return BaseAsyncScheduler(copied_scheduler) + except Exception: + return wrapped_scheduler + else: + copied_scheduler = copy.copy(wrapped_scheduler) + return BaseAsyncScheduler(copied_scheduler) + except Exception as e2: + logger.warning(f"Shallow copy of scheduler also failed: {e2}. 
Using original scheduler (*thread-unsafe but functional*).") + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -86,6 +182,8 @@ def _autodetect_mutables(self, max_attrs: int = 40): candidates: List[str] = [] seen = set() + + for name in dir(self._base): if name.startswith("__"): continue @@ -93,6 +191,7 @@ def _autodetect_mutables(self, max_attrs: int = 40): continue if name in ("to", "save_pretrained", "from_pretrained"): continue + try: val = getattr(self._base, name) except Exception: @@ -100,11 +199,9 @@ def _autodetect_mutables(self, max_attrs: int = 40): import types - # skip callables and modules if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): continue - # containers -> candidate if isinstance(val, (dict, list, set, tuple, bytearray)): candidates.append(name) seen.add(name) @@ -143,9 +240,7 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = { - "components", - } + EXCLUDE_ATTRS = {"components",} for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -193,18 +288,21 @@ def _clone_mutable_attrs(self, base, local): def _is_tokenizer_component(self, component) -> bool: if component is None: return False - - tokenizer_methods = ["encode", "decode", "tokenize", "__call__"] + + tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - + class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = "tokenizer" in class_name - - tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"] + has_tokenizer_in_name = 'tokenizer' in class_name + + tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) + def _should_wrap_tokenizers(self) -> bool: + return True + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) @@ -214,6 +312,15 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.warning(f"copy.copy(self._base) failed: {e}. 
Falling back to deepcopy (may increase memory).") local_pipe = copy.deepcopy(self._base) + try: + if hasattr(local_pipe, "vae") and local_pipe.vae is not None and not isinstance(local_pipe.vae, ThreadSafeVAEWrapper): + local_pipe.vae = ThreadSafeVAEWrapper(local_pipe.vae, self._vae_lock) + + if hasattr(local_pipe, "image_processor") and local_pipe.image_processor is not None and not isinstance(local_pipe.image_processor, ThreadSafeImageProcessorWrapper): + local_pipe.image_processor = ThreadSafeImageProcessorWrapper(local_pipe.image_processor, self._image_lock) + except Exception as e: + logger.debug(f"Could not wrap vae/image_processor: {e}") + if local_scheduler is not None: try: timesteps, num_steps, configured_scheduler = async_retrieve_timesteps( @@ -221,7 +328,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = num_inference_steps=num_inference_steps, device=device, return_scheduler=True, - **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]}, + **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} ) final_scheduler = BaseAsyncScheduler(configured_scheduler) @@ -230,48 +337,45 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") self._clone_mutable_attrs(self._base, local_pipe) + - # 4) wrap tokenizers on the local pipe with the lock wrapper - tokenizer_wrappers = {} # name -> original_tokenizer - try: - # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) - for name in dir(local_pipe): - if "tokenizer" in name and not name.startswith("_"): - tok = getattr(local_pipe, name, None) - if tok is not None and self._is_tokenizer_component(tok): - tokenizer_wrappers[name] = tok - setattr( - local_pipe, - name, - lambda *args, tok=tok, **kwargs: safe_tokenize( - tok, *args, lock=self._tokenizer_lock, **kwargs - ), - ) - - # b) wrap tokenizers in components dict - if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): - for key, val in local_pipe.components.items(): - if val is None: - continue - - if self._is_tokenizer_component(val): - tokenizer_wrappers[f"components[{key}]"] = val - local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( - tokenizer, *args, lock=self._tokenizer_lock, **kwargs - ) + original_tokenizers = {} + + if self._should_wrap_tokenizers(): + try: + for name in dir(local_pipe): + if "tokenizer" in name and not name.startswith("_"): + tok = getattr(local_pipe, name, None) + if tok is not None and self._is_tokenizer_component(tok): + if not isinstance(tok, ThreadSafeTokenizerWrapper): + original_tokenizers[name] = tok + wrapped_tokenizer = ThreadSafeTokenizerWrapper(tok, self._tokenizer_lock) + setattr(local_pipe, name, wrapped_tokenizer) + + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): + for key, val in local_pipe.components.items(): + if val is None: + continue + + if self._is_tokenizer_component(val): + if not isinstance(val, ThreadSafeTokenizerWrapper): + original_tokenizers[f"components[{key}]"] = val + wrapped_tokenizer = ThreadSafeTokenizerWrapper(val, self._tokenizer_lock) + local_pipe.components[key] = wrapped_tokenizer - except Exception as e: - logger.debug(f"Tokenizer wrapping step encountered an error: {e}") + except Exception as e: + logger.debug(f"Tokenizer wrapping step encountered an error: {e}") result = None cm = getattr(local_pipe, "model_cpu_offload_context", 
None) + try: + if callable(cm): try: with cm(): result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) except TypeError: - # cm might be a context manager instance rather than callable try: with cm: result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) @@ -279,18 +383,18 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) else: - # no offload context available — call directly result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) return result finally: try: - for name, tok in tokenizer_wrappers.items(): + for name, tok in original_tokenizers.items(): if name.startswith("components["): - key = name[len("components[") : -1] - local_pipe.components[key] = tok + key = name[len("components["):-1] + if hasattr(local_pipe, 'components') and isinstance(local_pipe.components, dict): + local_pipe.components[key] = tok else: setattr(local_pipe, name, tok) except Exception as e: - logger.debug(f"Error restoring wrapped tokenizers: {e}") + logger.debug(f"Error restoring original tokenizers: {e}") From 489da5d5139119caf915ab7ef37537ffe89c6806 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 20 Oct 2025 18:58:34 -0600 Subject: [PATCH 33/34] Add wrappers.py --- .../utils/requestscopedpipeline.py | 77 +------------------ examples/server-async/utils/wrappers.py | 74 ++++++++++++++++++ 2 files changed, 75 insertions(+), 76 deletions(-) create mode 100644 examples/server-async/utils/wrappers.py diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index 0b0e73ca04e2..c5acc35d5fab 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -4,85 +4,10 @@ import torch from diffusers.utils import logging from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps +from .wrappers import ThreadSafeTokenizerWrapper, ThreadSafeVAEWrapper, ThreadSafeImageProcessorWrapper logger = logging.get_logger(__name__) -class ThreadSafeTokenizerWrapper: - def __init__(self, tokenizer, lock): - self._tokenizer = tokenizer - self._lock = lock - - self._thread_safe_methods = { - '__call__', 'encode', 'decode', 'tokenize', - 'encode_plus', 'batch_encode_plus', 'batch_decode' - } - - def __getattr__(self, name): - attr = getattr(self._tokenizer, name) - - if name in self._thread_safe_methods and callable(attr): - def wrapped_method(*args, **kwargs): - with self._lock: - return attr(*args, **kwargs) - return wrapped_method - - return attr - - def __call__(self, *args, **kwargs): - with self._lock: - return self._tokenizer(*args, **kwargs) - - def __setattr__(self, name, value): - if name.startswith('_'): - super().__setattr__(name, value) - else: - setattr(self._tokenizer, name, value) - - def __dir__(self): - return dir(self._tokenizer) - - -class ThreadSafeVAEWrapper: - def __init__(self, vae, lock): - self._vae = vae - self._lock = lock - - def __getattr__(self, name): - attr = getattr(self._vae, name) - # métodos que queremos proteger - if name in {"decode", "encode", "forward"} and callable(attr): - def wrapped(*args, **kwargs): - with self._lock: - return attr(*args, **kwargs) - return wrapped - return attr - - def __setattr__(self, name, value): - if name.startswith("_"): - super().__setattr__(name, value) - else: - 
setattr(self._vae, name, value) - -class ThreadSafeImageProcessorWrapper: - def __init__(self, proc, lock): - self._proc = proc - self._lock = lock - - def __getattr__(self, name): - attr = getattr(self._proc, name) - if name in {"postprocess", "preprocess"} and callable(attr): - def wrapped(*args, **kwargs): - with self._lock: - return attr(*args, **kwargs) - return wrapped - return attr - - def __setattr__(self, name, value): - if name.startswith("_"): - super().__setattr__(name, value) - else: - setattr(self._proc, name, value) - class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", diff --git a/examples/server-async/utils/wrappers.py b/examples/server-async/utils/wrappers.py new file mode 100644 index 000000000000..5130f175c2b1 --- /dev/null +++ b/examples/server-async/utils/wrappers.py @@ -0,0 +1,74 @@ +class ThreadSafeTokenizerWrapper: + def __init__(self, tokenizer, lock): + self._tokenizer = tokenizer + self._lock = lock + + self._thread_safe_methods = { + '__call__', 'encode', 'decode', 'tokenize', + 'encode_plus', 'batch_encode_plus', 'batch_decode' + } + + def __getattr__(self, name): + attr = getattr(self._tokenizer, name) + + if name in self._thread_safe_methods and callable(attr): + def wrapped_method(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped_method + + return attr + + def __call__(self, *args, **kwargs): + with self._lock: + return self._tokenizer(*args, **kwargs) + + def __setattr__(self, name, value): + if name.startswith('_'): + super().__setattr__(name, value) + else: + setattr(self._tokenizer, name, value) + + def __dir__(self): + return dir(self._tokenizer) + + +class ThreadSafeVAEWrapper: + def __init__(self, vae, lock): + self._vae = vae + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._vae, name) + if name in {"decode", "encode", "forward"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._vae, name, value) + +class ThreadSafeImageProcessorWrapper: + def __init__(self, proc, lock): + self._proc = proc + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._proc, name) + if name in {"postprocess", "preprocess"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._proc, name, value) \ No newline at end of file From 581847fa6940bdb73a72752cba508be7131a0dc7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 21 Oct 2025 05:25:17 +0000 Subject: [PATCH 34/34] Apply style fixes --- .../utils/requestscopedpipeline.py | 105 ++++++++++-------- examples/server-async/utils/wrappers.py | 30 +++-- 2 files changed, 79 insertions(+), 56 deletions(-) diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index c5acc35d5fab..9c3276c31c69 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -1,17 +1,22 @@ -from typing import Optional, Any, Iterable, List import copy import threading +from typing import Any, Iterable, List, Optional + import torch + from diffusers.utils import logging + from .scheduler import 
BaseAsyncScheduler, async_retrieve_timesteps -from .wrappers import ThreadSafeTokenizerWrapper, ThreadSafeVAEWrapper, ThreadSafeImageProcessorWrapper +from .wrappers import ThreadSafeImageProcessorWrapper, ThreadSafeTokenizerWrapper, ThreadSafeVAEWrapper + logger = logging.get_logger(__name__) + class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", - "_offload_device", + "_offload_device", "_progress_bar_config", "_progress_bar", "_rng_state", @@ -29,42 +34,39 @@ def __init__( wrap_scheduler: bool = True, ): self._base = pipeline - - + self.unet = getattr(pipeline, "unet", None) - self.vae = getattr(pipeline, "vae", None) + self.vae = getattr(pipeline, "vae", None) self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) - + self.transformer = getattr(pipeline, "transformer", None) - - if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: + + if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None: if not isinstance(pipeline.scheduler, BaseAsyncScheduler): pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) - - + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() self._vae_lock = threading.Lock() self._image_lock = threading.Lock() - + self._auto_detect_mutables = bool(auto_detect_mutables) self._tensor_numel_threshold = int(tensor_numel_threshold) self._auto_detected_attrs: List[str] = [] def _detect_kernel_pipeline(self, pipeline) -> bool: kernel_indicators = [ - 'text_encoding_cache', - 'memory_manager', - 'enable_optimizations', - '_create_request_context', - 'get_optimization_stats' + "text_encoding_cache", + "memory_manager", + "enable_optimizations", + "_create_request_context", + "get_optimization_stats", ] - - return any(hasattr(pipeline, attr) for attr in kernel_indicators) + return any(hasattr(pipeline, attr) for attr in kernel_indicators) def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): base_sched = getattr(self._base, "scheduler", None) @@ -78,14 +80,12 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] try: return wrapped_scheduler.clone_for_request( - num_inference_steps=num_inference_steps, - device=device, - **clone_kwargs + num_inference_steps=num_inference_steps, device=device, **clone_kwargs ) except Exception as e: logger.debug(f"clone_for_request failed: {e}; trying shallow copy fallback") try: - if hasattr(wrapped_scheduler, 'scheduler'): + if hasattr(wrapped_scheduler, "scheduler"): try: copied_scheduler = copy.copy(wrapped_scheduler.scheduler) return BaseAsyncScheduler(copied_scheduler) @@ -95,8 +95,10 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] copied_scheduler = copy.copy(wrapped_scheduler) return BaseAsyncScheduler(copied_scheduler) except Exception as e2: - logger.warning(f"Shallow copy of scheduler also failed: {e2}. Using original scheduler (*thread-unsafe but functional*).") - return wrapped_scheduler + logger.warning( + f"Shallow copy of scheduler also failed: {e2}. Using original scheduler (*thread-unsafe but functional*)." 
+ ) + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -107,8 +109,7 @@ def _autodetect_mutables(self, max_attrs: int = 40): candidates: List[str] = [] seen = set() - - + for name in dir(self._base): if name.startswith("__"): continue @@ -116,7 +117,7 @@ def _autodetect_mutables(self, max_attrs: int = 40): continue if name in ("to", "save_pretrained", "from_pretrained"): continue - + try: val = getattr(self._base, name) except Exception: @@ -165,7 +166,9 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = {"components",} + EXCLUDE_ATTRS = { + "components", + } for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -213,16 +216,16 @@ def _clone_mutable_attrs(self, base, local): def _is_tokenizer_component(self, component) -> bool: if component is None: return False - - tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + + tokenizer_methods = ["encode", "decode", "tokenize", "__call__"] has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - + class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = 'tokenizer' in class_name - - tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + has_tokenizer_in_name = "tokenizer" in class_name + + tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"] has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) def _should_wrap_tokenizers(self) -> bool: @@ -238,11 +241,21 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = local_pipe = copy.deepcopy(self._base) try: - if hasattr(local_pipe, "vae") and local_pipe.vae is not None and not isinstance(local_pipe.vae, ThreadSafeVAEWrapper): + if ( + hasattr(local_pipe, "vae") + and local_pipe.vae is not None + and not isinstance(local_pipe.vae, ThreadSafeVAEWrapper) + ): local_pipe.vae = ThreadSafeVAEWrapper(local_pipe.vae, self._vae_lock) - if hasattr(local_pipe, "image_processor") and local_pipe.image_processor is not None and not isinstance(local_pipe.image_processor, ThreadSafeImageProcessorWrapper): - local_pipe.image_processor = ThreadSafeImageProcessorWrapper(local_pipe.image_processor, self._image_lock) + if ( + hasattr(local_pipe, "image_processor") + and local_pipe.image_processor is not None + and not isinstance(local_pipe.image_processor, ThreadSafeImageProcessorWrapper) + ): + local_pipe.image_processor = ThreadSafeImageProcessorWrapper( + local_pipe.image_processor, self._image_lock + ) except Exception as e: logger.debug(f"Could not wrap vae/image_processor: {e}") @@ -253,7 +266,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = num_inference_steps=num_inference_steps, device=device, return_scheduler=True, - **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} + **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]}, ) final_scheduler = BaseAsyncScheduler(configured_scheduler) @@ -262,10 +275,9 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") self._clone_mutable_attrs(self._base, local_pipe) - original_tokenizers = {} - + if self._should_wrap_tokenizers(): try: for name in dir(local_pipe): @@ 
-281,7 +293,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = for key, val in local_pipe.components.items(): if val is None: continue - + if self._is_tokenizer_component(val): if not isinstance(val, ThreadSafeTokenizerWrapper): original_tokenizers[f"components[{key}]"] = val @@ -293,9 +305,8 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = result = None cm = getattr(local_pipe, "model_cpu_offload_context", None) - + try: - if callable(cm): try: with cm(): @@ -316,8 +327,8 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = try: for name, tok in original_tokenizers.items(): if name.startswith("components["): - key = name[len("components["):-1] - if hasattr(local_pipe, 'components') and isinstance(local_pipe.components, dict): + key = name[len("components[") : -1] + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): local_pipe.components[key] = tok else: setattr(local_pipe, name, tok) diff --git a/examples/server-async/utils/wrappers.py b/examples/server-async/utils/wrappers.py index 5130f175c2b1..1e8474eabf3f 100644 --- a/examples/server-async/utils/wrappers.py +++ b/examples/server-async/utils/wrappers.py @@ -4,31 +4,38 @@ def __init__(self, tokenizer, lock): self._lock = lock self._thread_safe_methods = { - '__call__', 'encode', 'decode', 'tokenize', - 'encode_plus', 'batch_encode_plus', 'batch_decode' + "__call__", + "encode", + "decode", + "tokenize", + "encode_plus", + "batch_encode_plus", + "batch_decode", } - + def __getattr__(self, name): attr = getattr(self._tokenizer, name) - + if name in self._thread_safe_methods and callable(attr): + def wrapped_method(*args, **kwargs): with self._lock: return attr(*args, **kwargs) + return wrapped_method - + return attr def __call__(self, *args, **kwargs): with self._lock: return self._tokenizer(*args, **kwargs) - + def __setattr__(self, name, value): - if name.startswith('_'): + if name.startswith("_"): super().__setattr__(name, value) else: setattr(self._tokenizer, name, value) - + def __dir__(self): return dir(self._tokenizer) @@ -41,9 +48,11 @@ def __init__(self, vae, lock): def __getattr__(self, name): attr = getattr(self._vae, name) if name in {"decode", "encode", "forward"} and callable(attr): + def wrapped(*args, **kwargs): with self._lock: return attr(*args, **kwargs) + return wrapped return attr @@ -53,6 +62,7 @@ def __setattr__(self, name, value): else: setattr(self._vae, name, value) + class ThreadSafeImageProcessorWrapper: def __init__(self, proc, lock): self._proc = proc @@ -61,9 +71,11 @@ def __init__(self, proc, lock): def __getattr__(self, name): attr = getattr(self._proc, name) if name in {"postprocess", "preprocess"} and callable(attr): + def wrapped(*args, **kwargs): with self._lock: return attr(*args, **kwargs) + return wrapped return attr @@ -71,4 +83,4 @@ def __setattr__(self, name, value): if name.startswith("_"): super().__setattr__(name, value) else: - setattr(self._proc, name, value) \ No newline at end of file + setattr(self._proc, name, value)
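
Usage sketch for the series above: it shows how RequestScopedPipeline and the thread-safe wrappers are intended to be combined in a concurrent server, with model weights loaded once and only per-request state isolated. This is a minimal, illustrative example; the import path (utils.requestscopedpipeline), the checkpoint id, the prompt, and the worker count are assumptions, not values taken from the patches.

    from concurrent.futures import ThreadPoolExecutor

    import torch
    from diffusers import StableDiffusionPipeline

    # Assumed import path, mirroring examples/server-async/utils/requestscopedpipeline.py
    from utils.requestscopedpipeline import RequestScopedPipeline

    # Load the heavy weights once per process; UNet/VAE/text encoder stay shared.
    base = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # assumed checkpoint; any supported pipeline should work
        torch_dtype=torch.float16,
    ).to("cuda")

    # Each generate() call builds a request-local pipeline copy with its own
    # scheduler/timesteps and wraps tokenizer/VAE/image processor in the
    # thread-safe wrappers, so concurrent requests do not clobber each other.
    pipe = RequestScopedPipeline(base)

    def handle_request(prompt: str, steps: int = 30):
        result = pipe.generate(prompt=prompt, num_inference_steps=steps, device="cuda")
        return result.images[0]  # standard Stable Diffusion pipeline output field

    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(handle_request, f"a photo of a cat, variant {i}") for i in range(4)]
        images = [f.result() for f in futures]

The intent is that only per-request state (scheduler timesteps, RNG and progress-bar bookkeeping, tokenizer/VAE/image-processor calls) is isolated or serialized behind locks, while the large modules remain shared across threads.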