From bbfc5f4a9781e3491bb21083ab8c251b6e5ec382 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 17:23:42 -0600 Subject: [PATCH 01/34] Basic implementation of request scheduling --- src/diffusers/pipelines/pipeline_utils.py | 35 +++++++++++++++++++ src/diffusers/schedulers/scheduling_amused.py | 6 ++++ .../scheduling_consistency_decoder.py | 6 ++++ .../scheduling_consistency_models.py | 6 ++++ .../scheduling_cosine_dpmsolver_multistep.py | 6 ++++ src/diffusers/schedulers/scheduling_ddim.py | 6 ++++ .../schedulers/scheduling_ddim_cogvideox.py | 6 ++++ .../schedulers/scheduling_ddim_inverse.py | 6 ++++ .../schedulers/scheduling_ddim_parallel.py | 7 ++++ src/diffusers/schedulers/scheduling_ddpm.py | 6 ++++ .../schedulers/scheduling_ddpm_parallel.py | 6 ++++ .../schedulers/scheduling_ddpm_wuerstchen.py | 6 ++++ .../schedulers/scheduling_deis_multistep.py | 7 ++++ .../schedulers/scheduling_dpm_cogvideox.py | 6 ++++ .../scheduling_dpmsolver_multistep.py | 6 ++++ .../scheduling_dpmsolver_multistep_inverse.py | 7 ++++ .../schedulers/scheduling_dpmsolver_sde.py | 6 ++++ .../scheduling_dpmsolver_singlestep.py | 6 ++++ .../scheduling_edm_dpmsolver_multistep.py | 6 ++++ .../schedulers/scheduling_edm_euler.py | 6 ++++ .../scheduling_euler_ancestral_discrete.py | 6 ++++ .../schedulers/scheduling_euler_discrete.py | 6 ++++ .../scheduling_flow_match_euler_discrete.py | 6 ++++ src/diffusers/schedulers/scheduling_sde_ve.py | 6 ++++ 24 files changed, 176 insertions(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 023feae4dd27..08627a172df1 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -68,6 +68,8 @@ ) from ..utils.hub_utils import _check_legacy_sharding_variant_format, load_or_create_model_card, populate_model_card from ..utils.torch_utils import empty_device_cache, get_device, is_compiled_module +import copy +from types import SimpleNamespace if is_torch_npu_available(): @@ -177,6 +179,39 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) +import copy +from typing import Optional + +class RequestScopedPipeline: + def __init__(self, pipeline: "DiffusionPipeline"): + self._base = pipeline + self.unet = pipeline.unet + self.vae = pipeline.vae + self.text_encoder = getattr(pipeline, "text_encoder", None) + self.components = pipeline.components + + def _make_local_scheduler(self, num_inference_steps: int, **clone_kwargs): + base_sched = self._base.scheduler + if hasattr(base_sched, "clone_for_request"): + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + return copy.deepcopy(base_sched) + + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): + + local_scheduler = self._make_local_scheduler(num_inference_steps, device=device) + + local_pipe = copy.copy(self._base) + local_pipe.scheduler = local_scheduler + + if hasattr(local_pipe, "model_cpu_offload_context"): + cm = getattr(local_pipe, "model_cpu_offload_context") + if callable(cm): + with cm(): + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + + class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" Base class for all pipelines. 
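A minimal usage sketch of the `RequestScopedPipeline` wrapper added above (illustrative only: the checkpoint id and prompt are placeholders, and the class is imported from `diffusers.pipelines.pipeline_utils`, where this patch defines it):

    import torch
    from diffusers import StableDiffusionPipeline
    from diffusers.pipelines.pipeline_utils import RequestScopedPipeline

    # Load the heavy components (unet, vae, text encoder) once; they stay shared.
    base = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # Each generate() call runs on a shallow per-request copy of the pipeline with
    # its own scheduler (cloned via clone_for_request, or deepcopied as a fallback),
    # so concurrent requests do not overwrite each other's timestep state.
    request_pipe = RequestScopedPipeline(base)
    result = request_pipe.generate("a watercolor fox", num_inference_steps=30)
    result.images[0].save("fox.png")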
diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index 238b8d869171..ee767380e2f7 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -7,6 +7,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from .scheduling_utils import SchedulerMixin +import copy def gumbel_noise(t, generator=None): @@ -160,3 +161,8 @@ def add_noise(self, sample, timesteps, generator=None): masked_sample[mask_indices] = self.config.mask_token_id return masked_sample + + def clone_for_request(self, num_inference_steps: int, temperature=(2, 0), device: Union[str, torch.device] = None): + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, temperature=temperature, device=device) + return local diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index d7af018b284a..7bf3ec6f4aeb 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -8,6 +8,7 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin +import copy # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -109,6 +110,11 @@ def set_timesteps( self.c_out = self.c_out.to(device) self.c_in = self.c_in.to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + @property def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 0f5062258800..271369777301 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -243,6 +243,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): """Constructs the noise schedule of Karras et al. 
(2022).""" diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index 66ed296da8ea..ecda598b8ce3 100644 --- a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -241,6 +241,12 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # if a noise sampler is used, reinitialise it self.noise_sampler = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 5ee0d084f060..9dc1006ee2a1 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -339,6 +339,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py index c19efdc7834d..3e91077b7e50 100644 --- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py @@ -302,6 +302,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 49dba840d089..fba349c8fc9f 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -286,6 +286,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 7c3f03a8dbe1..49107c9bca17 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -362,6 +362,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) 
+ + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 0fab6d910a82..be6d7ad4880d 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -322,6 +322,12 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index ec741f9ecb7d..571aaf52bccc 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -332,6 +332,12 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 71f08277ebd7..126956204880 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -161,6 +161,12 @@ def set_timesteps( timesteps = torch.Tensor(timesteps).to(device) self.timesteps = timesteps + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 7d8685ba10c3..13adec66870c 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -317,6 +317,13 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py 
b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py index f7b63720e107..6de6d07f11c8 100644 --- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py @@ -303,6 +303,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index d07ff8b2007b..407215937fa6 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -457,6 +457,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 9ec958851111..fd886b48eb22 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -329,6 +329,13 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._step_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index eeb06773d977..9bba69be9e49 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -412,6 +412,12 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py 
b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 8663210a6244..9d0bebe13d99 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -407,6 +407,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index f1b38aaff56c..105603e01f8d 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -273,6 +273,12 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index dbeff3de5652..20d3be9756dc 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -261,6 +261,12 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 9cdaa2c5e101..5713ffcfdee0 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -318,6 +318,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index f58d918dbfbe..fee2d03e5291 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -449,6 +449,12 @@ def set_timesteps( self._begin_index = None self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(np.maximum(sigma, 1e-10)) diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index 1a4f12ddfa53..258e8252f557 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -348,6 +348,12 @@ def set_timesteps( self._step_index = None self._begin_index = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 1bfc08cce5e9..d31c6a9430cb 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -24,6 +24,7 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin, SchedulerOutput +import copy @dataclass @@ -122,6 +123,11 @@ def set_timesteps( self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) + def clone_for_request(self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None): + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, sampling_eps=sampling_eps, device=device) + return local + def set_sigmas( self, num_inference_steps: int, sigma_min: float = None, 
sigma_max: float = None, sampling_eps: float = None ): From a308e3ed48185ca0a6a7e5e238011ebe1a0f81ea Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 22:04:27 -0600 Subject: [PATCH 02/34] Basic editing in SD and Flux Pipelines --- src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/flux/pipeline_flux.py | 84 ++++++++++++++----- src/diffusers/pipelines/pipeline_utils.py | 49 +++++++++-- .../pipeline_stable_diffusion.py | 83 +++++++++++++----- .../pipeline_stable_diffusion_3.py | 82 +++++++++++++----- src/diffusers/schedulers/scheduling_tcd.py | 6 ++ src/diffusers/schedulers/scheduling_unclip.py | 6 ++ .../schedulers/scheduling_unipc_multistep.py | 7 ++ .../schedulers/scheduling_vq_diffusion.py | 6 ++ 9 files changed, 257 insertions(+), 67 deletions(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 25d5d213cf33..df9ecff685a2 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -521,6 +521,7 @@ DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin, + RequestScopedPipeline ) try: diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 124e611bd018..df1a4062fbea 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -13,8 +13,8 @@ # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union - +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import copy import numpy as np import torch from transformers import ( @@ -91,10 +91,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -111,36 +119,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. 
+ `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class FluxPipeline( diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 08627a172df1..0a53a189745c 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -21,7 +21,8 @@ import sys from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin +from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin, Iterable +import copy import numpy as np import PIL.Image @@ -179,35 +180,65 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -import copy -from typing import Optional - class RequestScopedPipeline: - def __init__(self, pipeline: "DiffusionPipeline"): + DEFAULT_MUTABLE_ATTRS = [ + "_all_hooks", + "_offload_device", + "_progress_bar_config", + "_progress_bar", + "_rng_state", + "_last_seed", + ] + + def __init__(self, pipeline: "DiffusionPipeline", mutable_attrs: Optional[Iterable[str]] = None): self._base = pipeline self.unet = pipeline.unet self.vae = pipeline.vae self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = pipeline.components + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) def _make_local_scheduler(self, num_inference_steps: int, **clone_kwargs): base_sched = self._base.scheduler if hasattr(base_sched, "clone_for_request"): - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + try: + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + except Exception as e: + logger.debug(f"clone_for_request failed: {e}, falling back to deepcopy()") return copy.deepcopy(base_sched) + def _clone_mutable_attrs(self, base, local): + for attr in self._mutable_attrs: + if hasattr(base, attr): + val = getattr(base, attr) + # safe shallow copy for common containers + if isinstance(val, dict): + setattr(local, attr, dict(val)) + elif isinstance(val, (list, tuple, set)): + setattr(local, attr, list(val)) + else: + try: + setattr(local, attr, copy.copy(val)) + except Exception: + setattr(local, attr, val) + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): local_scheduler = self._make_local_scheduler(num_inference_steps, device=device) local_pipe = copy.copy(self._base) + local_pipe.scheduler = local_scheduler + self._clone_mutable_attrs(self._base, local_pipe) - if hasattr(local_pipe, "model_cpu_offload_context"): - cm = getattr(local_pipe, "model_cpu_offload_context") - if callable(cm): + cm = getattr(local_pipe, "model_cpu_offload_context", None) + if callable(cm): + try: with cm(): return local_pipe(*args, 
num_inference_steps=num_inference_steps, **kwargs) + except TypeError: + with cm: + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index cb97f18efeff..ebc87f30a7f3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union - +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import copy import torch from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -101,8 +101,16 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -119,36 +127,73 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps + class StableDiffusionPipeline( diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 1618f89a49e3..0ee5ad4bc949 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -13,8 +13,8 @@ # limitations under the License. 
import inspect -from typing import Any, Callable, Dict, List, Optional, Union - +from typing import Any, Callable, Dict, List, Optional, Union, Tuple +import copy import torch from transformers import ( CLIPTextModelWithProjection, @@ -95,8 +95,16 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -113,36 +121,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin): diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 3fd5c341eca9..01a47bbd52a5 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -521,6 +521,12 @@ def set_timesteps( self._step_index = None self._begin_index = None + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index d78efabfbc57..4b07949ac30f 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -177,6 +177,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: prev_timestep = t - 1 diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 162a34bd2774..b0bc1d1a8b16 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -429,6 +429,13 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) + return local + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 57306301d023..7ab4f151de65 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ 
-197,6 +197,12 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.log_cumprod_bt = self.log_cumprod_bt.to(device) self.log_cumprod_ct = self.log_cumprod_ct.to(device) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + import copy + local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local + def step( self, model_output: torch.Tensor, From 4799b8eab2461b26ae33848bd24c738d7b402325 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 22:40:53 -0600 Subject: [PATCH 03/34] Small Fix --- src/diffusers/pipelines/pipeline_utils.py | 160 ++++++++++++++++++---- 1 file changed, 136 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 0a53a189745c..1bff80bd6bf7 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -188,47 +188,155 @@ class RequestScopedPipeline: "_progress_bar", "_rng_state", "_last_seed", + "latents", ] - def __init__(self, pipeline: "DiffusionPipeline", mutable_attrs: Optional[Iterable[str]] = None): + def __init__( + self, + pipeline: Any, + mutable_attrs: Optional[Iterable[str]] = None, + auto_detect_mutables: bool = True, + tensor_numel_threshold: int = 1_000_000, + ): self._base = pipeline - self.unet = pipeline.unet - self.vae = pipeline.vae + self.unet = getattr(pipeline, "unet", None) + self.vae = getattr(pipeline, "vae", None) self.text_encoder = getattr(pipeline, "text_encoder", None) - self.components = pipeline.components + self.components = getattr(pipeline, "components", None) + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) - def _make_local_scheduler(self, num_inference_steps: int, **clone_kwargs): - base_sched = self._base.scheduler + self._auto_detect_mutables = bool(auto_detect_mutables) + self._tensor_numel_threshold = int(tensor_numel_threshold) + + self._auto_detected_attrs: List[str] = [] + + def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): + base_sched = getattr(self._base, "scheduler", None) + if base_sched is None: + return None + if hasattr(base_sched, "clone_for_request"): try: - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, **clone_kwargs) + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) except Exception as e: - logger.debug(f"clone_for_request failed: {e}, falling back to deepcopy()") - return copy.deepcopy(base_sched) + logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + + try: + return copy.deepcopy(base_sched) + except Exception as e: + logger.warning(f"Deepcopy of scheduler failed: {e}. 
Returning original scheduler (*risky*).") + return base_sched + + def _autodetect_mutables(self, max_attrs: int = 40): + if not self._auto_detect_mutables: + return [] + + if self._auto_detected_attrs: + return self._auto_detected_attrs + + candidates: List[str] = [] + seen = set() + for name in dir(self._base): + if name.startswith("__"): + continue + if name in self._mutable_attrs: + continue + if name in ("to", "save_pretrained", "from_pretrained"): + continue + try: + val = getattr(self._base, name) + except Exception: + continue + + import types + + # skip callables and modules + if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): + continue + + # containers -> candidate + if isinstance(val, (dict, list, set, tuple, bytearray)): + candidates.append(name) + seen.add(name) + else: + # try Tensor detection + try: + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + candidates.append(name) + seen.add(name) + else: + logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}") + except Exception: + continue + + if len(candidates) >= max_attrs: + break + + self._auto_detected_attrs = candidates + logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") + return self._auto_detected_attrs def _clone_mutable_attrs(self, base, local): - for attr in self._mutable_attrs: - if hasattr(base, attr): + attrs_to_clone = list(self._mutable_attrs) + attrs_to_clone.extend(self._autodetect_mutables()) + + for attr in attrs_to_clone: + if not hasattr(base, attr): + continue + try: val = getattr(base, attr) - # safe shallow copy for common containers - if isinstance(val, dict): + except Exception: + continue + + # shallow copy for common containers + if isinstance(val, dict): + try: setattr(local, attr, dict(val)) - elif isinstance(val, (list, tuple, set)): + except Exception: + setattr(local, attr, val) + elif isinstance(val, (list, tuple, set)): + try: setattr(local, attr, list(val)) - else: - try: - setattr(local, attr, copy.copy(val)) - except Exception: - setattr(local, attr, val) + except Exception: + setattr(local, attr, val) + elif isinstance(val, bytearray): + try: + setattr(local, attr, bytearray(val)) + except Exception: + setattr(local, attr, val) + else: + try: + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + setattr(local, attr, val.clone()) + else: + setattr(local, attr, val) + else: + try: + setattr(local, attr, copy.copy(val)) + except Exception: + setattr(local, attr, val) + except Exception: + setattr(local, attr, val) + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): + local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) - local_scheduler = self._make_local_scheduler(num_inference_steps, device=device) + try: + local_pipe = copy.copy(self._base) + except Exception as e: + logger.warning(f"copy.copy(self._base) failed: {e}. 
Falling back to deepcopy (may increase memory).") + local_pipe = copy.deepcopy(self._base) + if local_scheduler is not None: + try: + setattr(local_pipe, "scheduler", local_scheduler) + except Exception: + logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") - local_pipe = copy.copy(self._base) - local_pipe.scheduler = local_scheduler self._clone_mutable_attrs(self._base, local_pipe) cm = getattr(local_pipe, "model_cpu_offload_context", None) @@ -237,8 +345,12 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = with cm(): return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) except TypeError: - with cm: - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + # cm may already be an instantiated context manager rather than a callable + try: + with cm: + return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except Exception as e: + logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) From eda58477cd3882ebb48b114c90851007556c10b0 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 6 Sep 2025 22:50:41 -0600 Subject: [PATCH 04/34] Fix --- src/diffusers/pipelines/pipeline_utils.py | 56 +++++++++++++++-------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 1bff80bd6bf7..e2b2d6e84ced 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -278,48 +278,68 @@ def _autodetect_mutables(self, max_attrs: int = 40): logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") return self._auto_detected_attrs + def _is_readonly_property(self, base_obj, attr_name: str) -> bool: + try: + cls = type(base_obj) + descriptor = getattr(cls, attr_name, None) + if isinstance(descriptor, property): + return descriptor.fset is None + if hasattr(descriptor, "__set__") is False and descriptor is not None: + return False + except Exception: + pass + return False + def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) + EXCLUDE_ATTRS = {"components",} # add more here if other problematic attributes turn up + for attr in attrs_to_clone: + if attr in EXCLUDE_ATTRS: + logger.debug(f"Skipping excluded attr '{attr}'") + continue if not hasattr(base, attr): continue + if self._is_readonly_property(base, attr): + logger.debug(f"Skipping read-only property '{attr}'") + continue + try: val = getattr(base, attr) - except Exception: + except Exception as e: + logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}") continue - # shallow copy for common containers - if isinstance(val, dict): - try: + try: + if isinstance(val, dict): setattr(local, attr, dict(val)) - except Exception: - setattr(local, attr, val) - elif isinstance(val, (list, tuple, set)): - try: + elif isinstance(val, (list, tuple, set)): setattr(local, attr, list(val)) - except Exception: - setattr(local, attr, val) - elif isinstance(val, bytearray): - try: + elif isinstance(val, bytearray): setattr(local, attr, bytearray(val)) - except Exception: - setattr(local, attr, val) - else: - try: + else: + # small tensors or atomic values if isinstance(val, torch.Tensor): if val.numel() <= self._tensor_numel_threshold: setattr(local, attr, val.clone()) else: + # 
don't clone big tensors, keep reference setattr(local, attr, val) else: try: setattr(local, attr, copy.copy(val)) except Exception: + # último recurso: asignar referencia setattr(local, attr, val) - except Exception: - setattr(local, attr, val) + except (AttributeError, TypeError) as e: + logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}") + # continuar sin fallar + continue + except Exception as e: + logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") + continue def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): From 6b5e6be114637340ca25effea9bfd0022e0f0ffd Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 7 Sep 2025 11:16:58 -0600 Subject: [PATCH 05/34] Update for more pipelines --- .../pipeline_stable_diffusion_xl.py | 81 ++++++++++++++----- .../pipeline_stable_diffusion_adapter.py | 80 +++++++++++++----- .../pipeline_stable_diffusion_xl_adapter.py | 80 +++++++++++++----- 3 files changed, 187 insertions(+), 54 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index b97cf6f1f6f8..81f1580fce4a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -120,10 +120,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -140,36 +148,73 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. 
""" + import copy + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusionXLPipeline( diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 1ce6987114a7..63f40497afff 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -136,10 +136,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -156,36 +164,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 2802d690f3cc..74a1a0bb1b22 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -161,10 +161,18 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -): +) : r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) Args: scheduler (`SchedulerMixin`): @@ -181,36 +189,72 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = _accepts("timesteps") if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = _accepts("sigmas") if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps class StableDiffusionXLAdapterPipeline( From df2933f727f2477690b443ecb6573f33502a7923 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 7 Sep 2025 12:37:56 -0600 Subject: [PATCH 06/34] Add examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 123 ++++++++ .../server-async/DiffusersServer/__init__.py | 3 + .../DiffusersServer/create_server.py | 45 +++ .../DiffusersServer/serverasync.py | 290 ++++++++++++++++++ .../DiffusersServer/superpipeline.py | 44 +++ .../DiffusersServer/uvicorn_diffu.py | 88 ++++++ examples/server-async/README.md | 118 +++++++ examples/server-async/requirements.txt | 6 + examples/server-async/server.py | 11 + examples/server-async/test.py | 60 ++++ 10 files changed, 788 insertions(+) create mode 100644 examples/server-async/DiffusersServer/Pipelines.py create mode 100644 examples/server-async/DiffusersServer/__init__.py create mode 100644 examples/server-async/DiffusersServer/create_server.py create mode 100644 examples/server-async/DiffusersServer/serverasync.py create mode 100644 examples/server-async/DiffusersServer/superpipeline.py create mode 100644 examples/server-async/DiffusersServer/uvicorn_diffu.py create mode 100644 examples/server-async/README.md create mode 100644 examples/server-async/requirements.txt create mode 100644 examples/server-async/server.py create mode 100644 examples/server-async/test.py diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py new file mode 100644 index 000000000000..648f708fd562 --- /dev/null +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -0,0 +1,123 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/Pipelines.py + +from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline +from diffusers.pipelines.flux.pipeline_flux import FluxPipeline +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline +import torch +import os +import logging +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + +class TextToImageInput(BaseModel): + model: str + prompt: str + size: str | None = None + n: int | None = None + +class TextToImagePipelineSD3: + def __init__(self, model_path: str | None = None): + """ + Inicialización de la clase con la ruta del modelo. + Si no se proporciona, se obtiene de la variable de entorno. + """ + self.model_path = model_path or os.getenv("MODEL_PATH") + self.pipeline: StableDiffusion3Pipeline = None + self.device: str = None + + def start(self): + """ + Inicia el pipeline cargando el modelo en CUDA o MPS según esté disponible. 
+        Uses the model path defined in __init__ and assigns a default value
+        based on the available device if none was provided.
+        """
+        if torch.cuda.is_available():
+            # If model_path was not set, assign the default model for CUDA.
+            model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large"
+            logger.info("Loading CUDA")
+            self.device = "cuda"
+            self.pipeline = StableDiffusion3Pipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+            ).to(device=self.device)
+        elif torch.backends.mps.is_available():
+            # If model_path was not set, assign the default model for MPS.
+            model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium"
+            logger.info("Loading MPS for Mac M Series")
+            self.device = "mps"
+            self.pipeline = StableDiffusion3Pipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+            ).to(device=self.device)
+        else:
+            raise Exception("No CUDA or MPS device available")
+
+class TextToImagePipelineFlux:
+    def __init__(self, model_path: str | None = None, low_vram: bool = False):
+        """
+        Initializes the class with the model path.
+        If it is not provided, it is read from the environment variable.
+        """
+        self.model_path = model_path or os.getenv("MODEL_PATH")
+        self.pipeline: FluxPipeline = None
+        self.device: str = None
+        self.low_vram = low_vram
+
+    def start(self):
+        if torch.cuda.is_available():
+            # If model_path was not set, assign the default model for CUDA.
+            model_path = self.model_path or "black-forest-labs/FLUX.1-schnell"
+            logger.info("Loading CUDA")
+            self.device = "cuda"
+            self.pipeline = FluxPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+            ).to(device=self.device)
+            if self.low_vram:
+                self.pipeline.enable_model_cpu_offload()
+            else:
+                pass
+        elif torch.backends.mps.is_available():
+            # If model_path was not set, assign the default model for MPS.
+            model_path = self.model_path or "black-forest-labs/FLUX.1-schnell"
+            logger.info("Loading MPS for Mac M Series")
+            self.device = "mps"
+            self.pipeline = FluxPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+            ).to(device=self.device)
+        else:
+            raise Exception("No CUDA or MPS device available")
+
+class TextToImagePipelineSD:
+    def __init__(self, model_path: str | None = None):
+        """
+        Initializes the class with the model path.
+        If it is not provided, it is read from the environment variable.
+        """
+        self.model_path = model_path or os.getenv("MODEL_PATH")
+        self.pipeline: StableDiffusionPipeline = None
+        self.device: str = None
+
+    def start(self):
+        if torch.cuda.is_available():
+            # If model_path was not set, assign the default model for CUDA.
+            model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5"
+            logger.info("Loading CUDA")
+            self.device = "cuda"
+            self.pipeline = StableDiffusionPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+            ).to(device=self.device)
+        elif torch.backends.mps.is_available():
+            # If model_path was not set, assign the default model for MPS.
+ model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" + logger.info("Loading MPS for Mac M Series") + self.device = "mps" + self.pipeline = StableDiffusionPipeline.from_pretrained( + model_path, + torch_dtype=torch.float16, + ).to(device=self.device) + else: + raise Exception("No CUDA or MPS device available") \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/__init__.py b/examples/server-async/DiffusersServer/__init__.py new file mode 100644 index 000000000000..d4dc75b71a1f --- /dev/null +++ b/examples/server-async/DiffusersServer/__init__.py @@ -0,0 +1,3 @@ +from .Pipelines import TextToImagePipelineSD3 +from .superpipeline import SuperPipelinesT2Img +from .create_server import create_inference_server_Async as DiffusersServerApp \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/create_server.py b/examples/server-async/DiffusersServer/create_server.py new file mode 100644 index 000000000000..a5e6357db9d7 --- /dev/null +++ b/examples/server-async/DiffusersServer/create_server.py @@ -0,0 +1,45 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/create_server.py + +from .Pipelines import * +from .serverasync import * +from .uvicorn_diffu import * +import asyncio + +def create_inference_server_Async( + model:str, + type_model: str = 't2im', + host: str = '0.0.0.0', + port: int = 8500, + threads=5, + enable_memory_monitor=True, + custom_model: bool = False, + custom_pipeline: Optional[Type] | None = None, + constructor_pipeline: Optional[Type] | None = None, + components: Optional[Dict[str, Any]] = None, + api_name: Optional[str] = 'custom_api', + torch_dtype = torch.bfloat16 +): + config = ServerConfigModels( + model=model, + type_models=type_model, + custom_model=custom_model, + custom_pipeline=custom_pipeline, + constructor_pipeline=constructor_pipeline, + components=components, + api_name=api_name, + torch_dtype=torch_dtype, + host=host, + port=port + ) + + app = create_app_fastapi(config) + + asyncio.run(run_uvicorn_server( + app, + host=host, + port=port, + workers=threads, + enable_memory_monitor=enable_memory_monitor + )) + + return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py new file mode 100644 index 000000000000..303f1aa31b3f --- /dev/null +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -0,0 +1,290 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/serverasync.py + +from fastapi import FastAPI, HTTPException, status +from fastapi.responses import FileResponse +from fastapi.middleware.cors import CORSMiddleware +from fastapi.concurrency import run_in_threadpool +from pydantic import BaseModel +from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD +import logging +from diffusers.utils.export_utils import export_to_video +from diffusers.pipelines.pipeline_utils import RequestScopedPipeline +from diffusers import * +from .superpipeline import * +import random +import uuid +import tempfile +from dataclasses import dataclass +import os +import torch +import threading +import gc +from typing import Optional, Dict, Any, Type +from dataclasses import dataclass, field +from typing import List + +@dataclass +class PresetModels: + SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) + SD3_5: List[str] = field(default_factory=lambda: 
['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) + Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-schnell']) + +class ModelPipelineInitializer: + def __init__(self, model: str = '', type_models: str = 't2im'): + self.model = model + self.type_models = type_models + self.pipeline = None + self.device = "cuda" if torch.cuda.is_available() else "mps" + self.model_type = None + + def initialize_pipeline(self): + if not self.model: + raise ValueError("Model name not provided") + + # Check if model exists in PresetModels + preset_models = PresetModels() + + # Determine which model type we're dealing with + if self.model in preset_models.SD3: + self.model_type = "SD3" + elif self.model in preset_models.SD3_5: + self.model_type = "SD3_5" + elif self.model in preset_models.Flux: + self.model_type = "Flux" + else: + self.model_type = "SD" + + # Create appropriate pipeline based on model type and type_models + if self.type_models == 't2im': + if self.model_type in ["SD3", "SD3_5"]: + self.pipeline = TextToImagePipelineSD3(self.model) + elif self.model_type == "Flux": + self.pipeline = TextToImagePipelineFlux(self.model) + elif self.model_type == "SD": + self.pipeline = TextToImagePipelineSD(self.model) + else: + raise ValueError(f"Model type {self.model_type} not supported for text-to-image") + elif self.type_models == 't2v': + raise ValueError(f"Unsupported type_models: {self.type_models}") + + return self.pipeline + +class Utils: + def __init__(self, host: str = '0.0.0.0', port: int = 8500): + self.service_url = f"http://{host}:{port}" + self.image_dir = os.path.join(tempfile.gettempdir(), "images") + if not os.path.exists(self.image_dir): + os.makedirs(self.image_dir) + + self.video_dir = os.path.join(tempfile.gettempdir(), "videos") + if not os.path.exists(self.video_dir): + os.makedirs(self.video_dir) + + def save_image(self, image): + if hasattr(image, "to"): + try: + image = image.to("cpu") + except Exception: + pass + + if isinstance(image, torch.Tensor): + from torchvision import transforms + to_pil = transforms.ToPILImage() + image = to_pil(image.squeeze(0).clamp(0, 1)) + + filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" + image_path = os.path.join(self.image_dir, filename) + logger.info(f"Saving image to {image_path}") + + image.save(image_path, format="PNG", optimize=True) + + del image + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return os.path.join(self.service_url, "images", filename) + + def save_video(self, video, fps): + filename = "video" + str(uuid.uuid4()).split("-")[0] + ".mp4" + video_path = os.path.join(self.video_dir, filename) + export = export_to_video(video, video_path, fps=fps) + logger.info(f"Saving video to {video_path}") + return os.path.join(self.service_url, "video", filename) + +@dataclass +class ServerConfigModels: + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' + custom_model : bool = False + constructor_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None + components: Optional[Dict[str, Any]] = None + api_name: Optional[str] = 'custom_api' + torch_dtype: Optional[torch.dtype] = None + host: str = '0.0.0.0' + port: int = 8500 + +def create_app_fastapi(config: ServerConfigModels) -> FastAPI: + app = FastAPI() + + class JSONBodyQueryAPI(BaseModel): + model : str | None = None + prompt : str + negative_prompt : str | None = 
None
+        num_inference_steps : int = 28
+        num_images_per_prompt : int = 1
+
+    logging.basicConfig(level=logging.INFO)
+    global logger
+    logger = logging.getLogger(__name__)
+
+    server_config = config or ServerConfigModels()
+    app.state.SERVER_CONFIG = server_config
+
+    global utils_app
+
+    utils_app = Utils(host=server_config.host, port=server_config.port)
+
+    logger.info(f"Initializing pipeline for model: {server_config.model}")
+    try:
+        if server_config.custom_model:
+            if server_config.constructor_pipeline is None:
+                raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required")
+            initializer = server_config.constructor_pipeline(
+                model_path=server_config.model,
+                pipeline=server_config.custom_pipeline,
+                torch_dtype=server_config.torch_dtype,
+                components=server_config.components,
+            )
+            model_pipeline = initializer.start()
+            app.state.CUSTOM_PIPELINE = server_config.custom_pipeline
+            app.state.MODEL_PIPELINE = model_pipeline
+            app.state.MODEL_INITIALIZER = initializer
+            logger.info(f"Custom pipeline initialized. Type: {type(model_pipeline)}")
+        else:
+            initializer = ModelPipelineInitializer(
+                model=server_config.model,
+                type_models=server_config.type_models,
+            )
+            model_pipeline = initializer.initialize_pipeline()
+            model_pipeline.start()
+
+            app.state.REQUEST_PIPE = RequestScopedPipeline(model_pipeline.pipeline)
+
+            # Lock for concurrency
+            pipeline_lock = threading.Lock()
+
+            app.state.MODEL_PIPELINE = model_pipeline
+            app.state.PIPELINE_LOCK = pipeline_lock
+            app.state.MODEL_INITIALIZER = initializer
+
+            logger.info("Pipeline initialized and ready to receive requests")
+    except Exception as e:
+        logger.error(f"Error initializing pipeline: {e}")
+        raise
+
+
+    @app.get("/")
+    async def root():
+        return {"message": "Welcome to the Diffusers Server"}
+
+    @app.post("/api/diffusers/inference")
+    async def api(json: JSONBodyQueryAPI):
+        prompt = json.prompt
+        negative_prompt = json.negative_prompt or ""
+        num_steps = json.num_inference_steps
+        num_images_per_prompt = json.num_images_per_prompt
+
+        wrapper = app.state.MODEL_PIPELINE
+        initializer = app.state.MODEL_INITIALIZER
+
+
+        if not wrapper or not wrapper.pipeline:
+            raise HTTPException(500, "Model not initialized correctly")
+        if not prompt.strip():
+            raise HTTPException(400, "No prompt provided")
+
+        def make_generator():
+            g = torch.Generator(device=initializer.device)
+            return g.manual_seed(random.randint(0, 10_000_000))
+
+        req_pipe = app.state.REQUEST_PIPE
+
+        def infer():
+            # The RequestScopedPipeline already handles everything needed for
+            # per-request inference internally; only the base model pipeline
+            # (e.g. StableDiffusion3Pipeline) has to be passed in.
+            gen = make_generator()
+            return req_pipe.generate(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                generator=gen,
+                num_inference_steps=num_steps,
+                num_images_per_prompt=num_images_per_prompt,
+                device=initializer.device
+            )
+
+        try:
+            output = await run_in_threadpool(infer)
+
+            urls = [utils_app.save_image(img) for img in output.images]
+            return {"response": urls}
+
+        except Exception as e:
+            logger.error(f"Error during inference: {e}")
+            raise HTTPException(500, f"Processing error: {e}")
+
+        finally:
+            import gc; gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+
+    @app.get("/images/{filename}")
+    async def serve_image(filename: str):
+        file_path = os.path.join(utils_app.image_dir, filename)
+        if not os.path.isfile(file_path):
+            raise 
HTTPException(status_code=404, detail="Image not found") + return FileResponse(file_path, media_type="image/png") + + @app.get("/api/models") + async def list_models(): + return { + "current_model" : server_config.model, + "type" : server_config.type_models, + "all_models": { + "type": "T2Img", + "SD3": PresetModels().SD3, + "SD3_5": PresetModels().SD3_5, + "Flux": PresetModels().Flux, + } + } + + @app.get("/api/status") + async def get_status(): + memory_info = {} + if torch.cuda.is_available(): + memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB + memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB + memory_info = { + "memory_allocated_gb": round(memory_allocated, 2), + "memory_reserved_gb": round(memory_reserved, 2), + "device": torch.cuda.get_device_name(0) + } + + return { + "current_model" : server_config.model, + "type_models" : server_config.type_models, + "memory" : memory_info} + + + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py new file mode 100644 index 000000000000..394ebac39011 --- /dev/null +++ b/examples/server-async/DiffusersServer/superpipeline.py @@ -0,0 +1,44 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/superpipeline.py + +from diffusers.pipelines import * +from diffusers import * +import torch +from typing import Optional, Dict, Any, Type +import logging + +logger = logging.getLogger(__name__) + +class SuperPipelinesT2Img: + def __init__(self, model_path: str, + pipeline: Type, + torch_dtype = torch.bfloat16, + components: Optional[Dict[str, Any]] = None,): + self.model_path = model_path + self.pipeline = pipeline + self.torch_dtype = torch_dtype + self.components = components or {} + self.device: str = None + + def start(self): + if torch.cuda.is_available(): + logger.info("Loading CUDA") + model_path = self.model_path + self.device = 'cuda' + self.pipeline = self.pipeline.from_pretrained( + model_path, + torch_dtype = self.torch_dtype, + ** self.components + ).to(device=self.device) + elif torch.backends.mps.is_available(): + logger.info("Loading MPS for Mac M Series") + model_path = self.model_path + self.device = 'mps' + self.pipeline = self.pipeline.from_pretrained( + model_path, + torch_dtype = self.torch_dtype, + **self.components + ).to(device=self.device) + else: + raise Exception("No CUDA or MPS device available") + + return self \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py new file mode 100644 index 000000000000..7e19b50f3cbe --- /dev/null +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -0,0 +1,88 @@ +# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/uvicorn_diffu.py + +import uvicorn +import logging +import gc +import psutil +import os +import threading +import time +import string + +def setup_logging(): + logging.basicConfig(level=logging.INFO) + return logging.getLogger('uvicorn') + +logger = setup_logging() + +def memory_cleanup(interval=30): + while True: + try: + + gc.collect() + + + process = psutil.Process(os.getpid()) + mem = process.memory_info().rss / 1024 / 1024 + logger.info(f"Memoria en uso: {mem:.2f} MB") + + time.sleep(interval) + except Exception as e: + logger.error(f"Error en 
limpieza de memoria: {str(e)}") + time.sleep(interval) + +def run_uvicorn_server( + app, + host='0.0.0.0', + port=8500, + workers=5, + cleanup_interval=30, + channel_timeout=900, + headers=[ + ("server", "DiffusersServer") + ], + enable_memory_monitor=True +): + """ + Ejecuta un servidor de FastAPI utilizando Uvicorn con monitoreo de memoria opcional + + Args: + app: Aplicación FastAPI + host (str): Host donde se servirá la aplicación + port (int): Puerto para el servidor + workers (int): Número de hilos para Uvicorn + cleanup_interval (int): Intervalo de limpieza para Uvicorn + channel_timeout (int): Tiempo de espera máximo para canales + server_header (bool): Activar el identificador / Header del servidor + headers (str): Identificador del servidor / Header del servidor + enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria + + Returns: + El resultado de serve() (aunque normalmente no retorna) + """ + gc.enable() + gc.set_threshold(700, 10, 5) + + if enable_memory_monitor: + cleanup_thread = threading.Thread( + target=memory_cleanup, + args=(cleanup_interval,), + daemon=True + ) + cleanup_thread.start() + logger.info("Monitor de memoria activado") + + logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") + + config = uvicorn.Config( + app=app, + host=host, + workers=workers, + port=port, + timeout_keep_alive=channel_timeout, + headers=headers + ) + + server = uvicorn.Server(config) + + return server.serve() \ No newline at end of file diff --git a/examples/server-async/README.md b/examples/server-async/README.md new file mode 100644 index 000000000000..a13529b7d555 --- /dev/null +++ b/examples/server-async/README.md @@ -0,0 +1,118 @@ +# Asynchronous server and parallel execution of models + +> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. + +## ⚠️ IMPORTANT + +* This example uses a custom Diffusers fork: `https://github.com/F4k3r22/diffusers-async`. +* The server and inference harness live in this repo: `https://github.com/F4k3r22/DiffusersServer`. + The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. + +## Necessary components + +All the components needed to create the inference server are in `DiffusersServer/` + +``` +DiffusersServer/ # the example server package +├── __init__.py +├── create_server.py # helper script to build/run the app programmatically +├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) +├── serverasync.py # FastAPI app factory (create_app_fastapi) +├── superpipeline.py # optional custom pipeline glue code +├── uvicorn_diffu.py # convenience script to start uvicorn with recommended flags +``` + + +## What `diffusers-async` adds / Why we needed it + +Core problem: a naive server that calls `pipe.__call__` concurrently can hit **race conditions** (e.g., `scheduler.set_timesteps` mutates shared state) or explode memory by deep-copying the whole pipeline per-request. + +`diffusers-async` / this example addresses that by: + +* **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*. 
+* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. Where available we call `scheduler.clone_for_request(...)`, otherwise we fallback to safe `deepcopy` or other heuristics. +* **`retrieve_timesteps(..., return_scheduler=True)`**: retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. This is the safe path for getting a scheduler configured per-request. +* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. + +## How the server works (high-level flow) + +1. **Single model instance** is loaded into memory (GPU/MPS) when the server starts. +2. On each HTTP inference request: + + * The server uses `RequestScopedPipeline.generate(...)` which: + + * obtains a *local scheduler* (via `clone_for_request()` or `deepcopy`), + * does `local_pipe = copy.copy(base_pipe)` (shallow copy), + * sets `local_pipe.scheduler = local_scheduler` (if possible), + * clones only small mutable attributes (callbacks, rng, small latents), + * optionally enters a `model_cpu_offload_context()` for memory offload hooks, + * calls the pipeline on the local view (`local_pipe(...)`). +3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`). +4. Multiple requests can run in parallel while sharing heavy weights and isolating mutable state. + + +## How to set up and run the server + +### 1) Install dependencies + +Recommended: create a virtualenv / conda environment. + +If using the `diffusers` fork via git, either: + +**A) Preinstall the fork first (if you want to avoid hatch direct references):** + +```bash +pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" +pip install -r requirements.txt +``` + +### 2) Start the server + +Using the `server.py` file that already has everything you need: + +```bash +python server.py +``` + +### 3) Example request + +`POST /api/diffusers/inference` with JSON body: + +```json +{ + "prompt": "A futuristic cityscape, vibrant colors", + "num_inference_steps": 30, + "num_images_per_prompt": 1 +} +``` + +Response example: + +```json +{ + "response": ["http://localhost:8500/images/img123.png"] +} +``` + +## Troubleshooting (quick) + +* `Already borrowed` — tokenizers (Rust) error when used concurrently. + + * Workarounds: + + * Acquire a `Lock` around tokenization or around the pipeline call (serializes that part). + * Use the slow tokenizer (`converter_to_slow`) for concurrency tests. + * Patch only the tokenization method to use a lock instead of serializing entire forward. +* `can't set attribute 'components'` — pipeline exposes read-only `components`. + + * The RequestScopedPipeline now detects read-only properties and skips setting them. +* Scheduler issues: + + * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. + + +## Integration notes / performance tips + +* **Compile UNet**: try `pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")` — measure before enabling compile widely. If compile fails, compile only the UNet or use `mode="reduce-overhead"`. +* **Offload**: use `pipeline.enable_model_cpu_offload()` where appropriate to reduce peak GPU memory. 
+* **Quantization**: bitsandbytes quantization reduces memory but may require extra torch.compile flags (e.g., `torch._dynamo.config.capture_dynamic_output_shape_ops = True`). diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt new file mode 100644 index 000000000000..50eeed9b2f9e --- /dev/null +++ b/examples/server-async/requirements.txt @@ -0,0 +1,6 @@ +torch +torchvision +transformers +sentencepiece +fastapi +uvicorn \ No newline at end of file diff --git a/examples/server-async/server.py b/examples/server-async/server.py new file mode 100644 index 000000000000..590522038a53 --- /dev/null +++ b/examples/server-async/server.py @@ -0,0 +1,11 @@ +# DiffusersServerApp already handles the inference server and everything else internally, you +# just need to do these basic configurations and run the script with "python server.py" +# and you already get access to the inference APIs. +from DiffusersServer import DiffusersServerApp + +app = DiffusersServerApp( + model='stabilityai/stable-diffusion-3.5-medium', + type_model='t2im', + threads=3, + enable_memory_monitor=True +) \ No newline at end of file diff --git a/examples/server-async/test.py b/examples/server-async/test.py new file mode 100644 index 000000000000..2a68c77bb28f --- /dev/null +++ b/examples/server-async/test.py @@ -0,0 +1,60 @@ +import os +import time +import urllib.parse +import requests + +SERVER_URL = "http://localhost:8500/api/diffusers/inference" +BASE_URL = "http://localhost:8500" +DOWNLOAD_FOLDER = "imagenes_generadas" +WAIT_BEFORE_DOWNLOAD = 2 # seconds + +os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) + +def save_from_url(url: str) -> str: + """Download the given URL (relative or absolute) and save it locally.""" + if url.startswith("/"): + direct = BASE_URL.rstrip("/") + url + else: + direct = url + resp = requests.get(direct, timeout=60) + resp.raise_for_status() + filename = os.path.basename(urllib.parse.urlparse(direct).path) or f"img_{int(time.time())}.png" + path = os.path.join(DOWNLOAD_FOLDER, filename) + with open(path, "wb") as f: + f.write(resp.content) + return path + +def main(): + payload = { + "prompt": "The T-800 Terminator Robot Returning From The Future, Anime Style", + "num_inference_steps": 30, + "num_images_per_prompt": 1 + } + + print("Sending request...") + try: + r = requests.post(SERVER_URL, json=payload, timeout=480) + r.raise_for_status() + except Exception as e: + print(f"Request failed: {e}") + return + + body = r.json().get("response", []) + # Normalize to a list + urls = body if isinstance(body, list) else [body] if body else [] + if not urls: + print("No URLs found in the response. Check the server output.") + return + + print(f"Received {len(urls)} URL(s). 
Waiting {WAIT_BEFORE_DOWNLOAD}s before downloading...") + time.sleep(WAIT_BEFORE_DOWNLOAD) + + for u in urls: + try: + path = save_from_url(u) + print(f"Image saved to: {path}") + except Exception as e: + print(f"Error downloading {u}: {e}") + +if __name__ == "__main__": + main() From 5c7c7c6077181306b813e82228b74afc2f30ca32 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 7 Sep 2025 12:43:29 -0600 Subject: [PATCH 07/34] Add examples/server-async --- examples/server-async/README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index a13529b7d555..10b4c1825098 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -59,7 +59,7 @@ Recommended: create a virtualenv / conda environment. If using the `diffusers` fork via git, either: -**A) Preinstall the fork first (if you want to avoid hatch direct references):** +**A) Preinstall the fork first:** ```bash pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" @@ -110,9 +110,3 @@ Response example: * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. - -## Integration notes / performance tips - -* **Compile UNet**: try `pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")` — measure before enabling compile widely. If compile fails, compile only the UNet or use `mode="reduce-overhead"`. -* **Offload**: use `pipeline.enable_model_cpu_offload()` where appropriate to reduce peak GPU memory. -* **Quantization**: bitsandbytes quantization reduces memory but may require extra torch.compile flags (e.g., `torch._dynamo.config.capture_dynamic_output_shape_ops = True`). 
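
For reference, a minimal sketch of driving the request-scoped wrapper from this series directly, without the FastAPI server. It assumes the fork from this PR is installed; the model id, dtype, step count, and seeds are illustrative choices rather than part of the patches:

```python
import threading

import torch
from diffusers import StableDiffusion3Pipeline
from diffusers.pipelines.pipeline_utils import RequestScopedPipeline

# Heavy weights are loaded once and shared by every request-scoped view.
base = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.float16
).to("cuda")
request_pipe = RequestScopedPipeline(base)

def handle_request(prompt: str, idx: int) -> None:
    # Each call gets its own shallow pipeline copy and per-request scheduler,
    # so concurrent calls do not race on scheduler.set_timesteps.
    generator = torch.Generator(device="cuda").manual_seed(idx)
    result = request_pipe.generate(
        prompt=prompt,
        generator=generator,
        num_inference_steps=28,
        device="cuda",
    )
    result.images[0].save(f"out_{idx}.png")

threads = [threading.Thread(target=handle_request, args=("a watercolor fox", i)) for i in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

This is the same `RequestScopedPipeline.generate(...)` call that `serverasync.py` runs inside `run_in_threadpool`, so it mirrors the server's concurrency model.
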
From bd3e48a2af68e104840c7137cf755ad687920e68 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 11:50:09 -0600 Subject: [PATCH 08/34] Updated RequestScopedPipeline to handle a single tokenizer lock to avoid race conditions --- src/diffusers/pipelines/pipeline_utils.py | 137 ++++++++++++++++++++-- 1 file changed, 124 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index b05cf71568b1..42f70e6a7330 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -23,6 +23,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin, Iterable import copy +import threading import numpy as np import PIL.Image @@ -180,6 +181,34 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + +class _TokenizerLockWrapper: + def __init__(self, tokenizer: Any, lock: threading.Lock): + self._tokenizer = tokenizer + self._lock = lock + + def __call__(self, *args, **kwargs): + with self._lock: + return self._tokenizer(*args, **kwargs) + + # common tokenizer methods some codepaths call + def encode(self, *args, **kwargs): + with self._lock: + return getattr(self._tokenizer, "encode")(*args, **kwargs) + + def batch_encode_plus(self, *args, **kwargs): + with self._lock: + return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) + + def encode_plus(self, *args, **kwargs): + with self._lock: + return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) + + # fallback: delegate any other attribute access to the original tokenizer + def __getattr__(self, name): + return getattr(self._tokenizer, name) + + class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", @@ -197,6 +226,7 @@ def __init__( mutable_attrs: Optional[Iterable[str]] = None, auto_detect_mutables: bool = True, tensor_numel_threshold: int = 1_000_000, + tokenizer_lock: Optional[threading.Lock] = None ): self._base = pipeline self.unet = getattr(pipeline, "unet", None) @@ -205,6 +235,7 @@ def __init__( self.components = getattr(pipeline, "components", None) self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() self._auto_detect_mutables = bool(auto_detect_mutables) self._tensor_numel_threshold = int(tensor_numel_threshold) @@ -294,7 +325,7 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = {"components",} # añade más si encuentras otros problemáticos + EXCLUDE_ATTRS = {"components",} for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -350,29 +381,109 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = except Exception as e: logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).") local_pipe = copy.deepcopy(self._base) + if local_scheduler is not None: try: setattr(local_pipe, "scheduler", local_scheduler) except Exception: logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") - self._clone_mutable_attrs(self._base, local_pipe) + # 4) wrap tokenizers on the local pipe with the lock wrapper + wrapped_tokenizers = {} # name -> original_tokenizer + try: + # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) 
+ for name in dir(local_pipe): + if "tokenizer" in name and not name.startswith("_"): + try: + tok = getattr(local_pipe, name, None) + if tok is None: + continue + # avoid double-wrapping + if isinstance(tok, _TokenizerLockWrapper): + continue + # perform wrap + originals_tok = tok + try: + setattr(local_pipe, name, _TokenizerLockWrapper(originals_tok, self._tokenizer_lock)) + wrapped_tokenizers[name] = originals_tok + except Exception: + logger.debug(f"Failed to wrap tokenizer attribute '{name}' with lock.") + except Exception: + # ignore attribute access errors + continue + + # b) also check components mapping if present (common pattern) + comps = getattr(local_pipe, "components", None) + if isinstance(comps, dict): + for key, val in list(comps.items()): + # only handle values that look like tokenizers + if key and "tokenizer" in str(key).lower(): + try: + if isinstance(val, _TokenizerLockWrapper): + continue + wrapped_name = f"components[{key}]" + local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) + wrapped_tokenizers[wrapped_name] = val + except Exception: + logger.debug(f"Failed to wrap components['{key}'] tokenizer with lock.") + else: + # sometimes tokenizers are stored as values with names that include 'tokenizer' + try: + if hasattr(val, "__class__") and "tokenizer" in val.__class__.__name__.lower(): + wrapped_name = f"components[{key}]" + if isinstance(val, _TokenizerLockWrapper): + continue + local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) + wrapped_tokenizers[wrapped_name] = val + except Exception: + continue + + except Exception as e: + logger.debug(f"Tokenizer wrapping step encountered an error: {e}") + + # 5) run the pipeline, trying model_cpu_offload_context if available + result = None cm = getattr(local_pipe, "model_cpu_offload_context", None) - if callable(cm): - try: - with cm(): - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except TypeError: - # puede ser que cm sea un context manager ya instanciado en vez de callable + try: + if callable(cm): try: - with cm: - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except Exception as e: - logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") + with cm(): + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except TypeError: + # cm might be a context manager instance rather than callable + try: + with cm: + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except Exception as e: + logger.debug(f"model_cpu_offload_context usage failed: {e}. 
Proceeding without it.") + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + else: + # no offload context available — call directly + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - return local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + return result + + finally: + # 6) restore any wrapped tokenizers on local_pipe (best-effort, local_pipe will be GC'd) + try: + # restore direct attrs + for name, orig in list(wrapped_tokenizers.items()): + if name.startswith("components["): + # components entry + key = name[len("components["):-1] + try: + local_pipe.components[key] = orig + except Exception: + pass + else: + try: + setattr(local_pipe, name, orig) + except Exception: + pass + except Exception as e: + logger.debug(f"Error restoring wrapped tokenizers: {e}") class DiffusionPipeline(ConfigMixin, PushToHubMixin): From 534710c854726db50489bfb39846c66819d4c5e0 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 15:49:41 -0600 Subject: [PATCH 09/34] Fix --- src/diffusers/pipelines/pipeline_utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 42f70e6a7330..8eb9057a9fb2 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -191,22 +191,26 @@ def __call__(self, *args, **kwargs): with self._lock: return self._tokenizer(*args, **kwargs) - # common tokenizer methods some codepaths call + def __getattr__(self, name): + return getattr(self._tokenizer, name) + + def __len__(self): + return len(self._tokenizer) + + def __getitem__(self, item): + return self._tokenizer[item] + def encode(self, *args, **kwargs): with self._lock: - return getattr(self._tokenizer, "encode")(*args, **kwargs) + return self._tokenizer.encode(*args, **kwargs) def batch_encode_plus(self, *args, **kwargs): with self._lock: - return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) + return self._tokenizer.batch_encode_plus(*args, **kwargs) def encode_plus(self, *args, **kwargs): with self._lock: - return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) - - # fallback: delegate any other attribute access to the original tokenizer - def __getattr__(self, name): - return getattr(self._tokenizer, name) + return self._tokenizer.encode_plus(*args, **kwargs) class RequestScopedPipeline: From 4d7c64feb65b9d97be016f57730bf5f7d319a15a Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 15:57:57 -0600 Subject: [PATCH 10/34] Fix _TokenizerLockWrapper --- src/diffusers/pipelines/pipeline_utils.py | 102 +++++++++++++++++++--- 1 file changed, 90 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 8eb9057a9fb2..6f9f2249d045 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -187,30 +187,108 @@ def __init__(self, tokenizer: Any, lock: threading.Lock): self._tokenizer = tokenizer self._lock = lock + # --- callables that must be protected by the lock --- def __call__(self, *args, **kwargs): with self._lock: return self._tokenizer(*args, **kwargs) - def __getattr__(self, name): - return getattr(self._tokenizer, name) - - def __len__(self): - return len(self._tokenizer) - - def __getitem__(self, item): - return self._tokenizer[item] - def encode(self, *args, **kwargs): with self._lock: - return 
self._tokenizer.encode(*args, **kwargs) + return getattr(self._tokenizer, "encode")(*args, **kwargs) def batch_encode_plus(self, *args, **kwargs): with self._lock: - return self._tokenizer.batch_encode_plus(*args, **kwargs) + return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) def encode_plus(self, *args, **kwargs): with self._lock: - return self._tokenizer.encode_plus(*args, **kwargs) + return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) + + # --- attribute delegation for everything else --- + def __getattr__(self, name): + # Called only if attribute is not found on this wrapper; + # delegate to the real tokenizer + return getattr(self._tokenizer, name) + + def __repr__(self): + return f"" + + def __str__(self): + return str(self._tokenizer) + + def __len__(self): + try: + return len(self._tokenizer) + except Exception: + return 0 + + def __iter__(self): + return iter(self._tokenizer) + + def __contains__(self, item): + try: + return item in self._tokenizer + except Exception: + return False + + def __getitem__(self, key): + return self._tokenizer[key] + + # --- numeric / comparison support (crucial to fix your TypeError) --- + def _as_int(self) -> int: + """ + Best-effort integer representation for comparisons: + prefer vocab_size, then model_max_length-like attributes, then len(tokenizer), else 0. + """ + for attr in ("vocab_size", "vocab_size_base", "model_max_length", "max_len_single_sentence", "max_len"): + val = getattr(self._tokenizer, attr, None) + if isinstance(val, int): + return val + try: + return int(len(self._tokenizer)) + except Exception: + return 0 + + def __int__(self): + return self._as_int() + + def __index__(self): + return self._as_int() + + # rich comparisons - delegate to integer representation when compared with numbers + def __lt__(self, other): + try: + return self._as_int() < int(other) + except Exception: + return NotImplemented + + def __le__(self, other): + try: + return self._as_int() <= int(other) + except Exception: + return NotImplemented + + def __gt__(self, other): + try: + return self._as_int() > int(other) + except Exception: + return NotImplemented + + def __ge__(self, other): + try: + return self._as_int() >= int(other) + except Exception: + return NotImplemented + + def __eq__(self, other): + # equality: unwrap if other is also wrapper + if isinstance(other, _TokenizerLockWrapper): + return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other._tokenizer) + return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other) + + def __ne__(self, other): + return not self.__eq__(other) + class RequestScopedPipeline: From 18db9e6ff7e9199f9c1f93e5054257cea27884ec Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 16:02:14 -0600 Subject: [PATCH 11/34] Fix _TokenizerLockWrapper --- src/diffusers/pipelines/pipeline_utils.py | 86 +++-------------------- 1 file changed, 10 insertions(+), 76 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6f9f2249d045..6de9fbb14380 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -183,11 +183,10 @@ def __init__(self, *args, **kwargs): class _TokenizerLockWrapper: - def __init__(self, tokenizer: Any, lock: threading.Lock): + def __init__(self, tokenizer, lock): self._tokenizer = tokenizer self._lock = lock - # --- callables that must be protected by the lock --- def __call__(self, *args, **kwargs): with self._lock: return 
self._tokenizer(*args, **kwargs) @@ -204,90 +203,25 @@ def encode_plus(self, *args, **kwargs): with self._lock: return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) - # --- attribute delegation for everything else --- def __getattr__(self, name): - # Called only if attribute is not found on this wrapper; - # delegate to the real tokenizer return getattr(self._tokenizer, name) - def __repr__(self): - return f"" - - def __str__(self): - return str(self._tokenizer) - - def __len__(self): - try: - return len(self._tokenizer) - except Exception: - return 0 - - def __iter__(self): - return iter(self._tokenizer) - - def __contains__(self, item): - try: - return item in self._tokenizer - except Exception: - return False - - def __getitem__(self, key): - return self._tokenizer[key] - - # --- numeric / comparison support (crucial to fix your TypeError) --- - def _as_int(self) -> int: - """ - Best-effort integer representation for comparisons: - prefer vocab_size, then model_max_length-like attributes, then len(tokenizer), else 0. - """ - for attr in ("vocab_size", "vocab_size_base", "model_max_length", "max_len_single_sentence", "max_len"): - val = getattr(self._tokenizer, attr, None) - if isinstance(val, int): - return val - try: - return int(len(self._tokenizer)) - except Exception: - return 0 - def __int__(self): - return self._as_int() - - def __index__(self): - return self._as_int() + return getattr(self._tokenizer, "vocab_size", 0) - # rich comparisons - delegate to integer representation when compared with numbers def __lt__(self, other): - try: - return self._as_int() < int(other) - except Exception: - return NotImplemented - + try: return int(self) < int(other) + except Exception: return NotImplemented def __le__(self, other): - try: - return self._as_int() <= int(other) - except Exception: - return NotImplemented - + try: return int(self) <= int(other) + except Exception: return NotImplemented def __gt__(self, other): - try: - return self._as_int() > int(other) - except Exception: - return NotImplemented - + try: return int(self) > int(other) + except Exception: return NotImplemented def __ge__(self, other): - try: - return self._as_int() >= int(other) - except Exception: - return NotImplemented - - def __eq__(self, other): - # equality: unwrap if other is also wrapper - if isinstance(other, _TokenizerLockWrapper): - return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other._tokenizer) - return getattr(self._tokenizer, "__eq__", lambda o: self._tokenizer == o)(other) + try: return int(self) >= int(other) + except Exception: return NotImplemented - def __ne__(self, other): - return not self.__eq__(other) From 8f0efb1a456ba5686cedaf212bb6520b37770367 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 16:18:09 -0600 Subject: [PATCH 12/34] Delete _TokenizerLockWrapper --- src/diffusers/pipelines/pipeline_utils.py | 120 ++++------------------ 1 file changed, 22 insertions(+), 98 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6de9fbb14380..45339e833c78 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -181,48 +181,9 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - -class _TokenizerLockWrapper: - def __init__(self, tokenizer, lock): - self._tokenizer = tokenizer - self._lock = lock - - def __call__(self, *args, **kwargs): - with self._lock: - return self._tokenizer(*args, **kwargs) - - def encode(self, 
*args, **kwargs): - with self._lock: - return getattr(self._tokenizer, "encode")(*args, **kwargs) - - def batch_encode_plus(self, *args, **kwargs): - with self._lock: - return getattr(self._tokenizer, "batch_encode_plus")(*args, **kwargs) - - def encode_plus(self, *args, **kwargs): - with self._lock: - return getattr(self._tokenizer, "encode_plus")(*args, **kwargs) - - def __getattr__(self, name): - return getattr(self._tokenizer, name) - - def __int__(self): - return getattr(self._tokenizer, "vocab_size", 0) - - def __lt__(self, other): - try: return int(self) < int(other) - except Exception: return NotImplemented - def __le__(self, other): - try: return int(self) <= int(other) - except Exception: return NotImplemented - def __gt__(self, other): - try: return int(self) > int(other) - except Exception: return NotImplemented - def __ge__(self, other): - try: return int(self) >= int(other) - except Exception: return NotImplemented - - +def safe_tokenize(tokenizer, *args, lock, **kwargs): + with lock: + return tokenizer(*args, **kwargs) class RequestScopedPipeline: @@ -407,59 +368,31 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = self._clone_mutable_attrs(self._base, local_pipe) # 4) wrap tokenizers on the local pipe with the lock wrapper - wrapped_tokenizers = {} # name -> original_tokenizer + tokenizer_wrappers = {} # name -> original_tokenizer try: # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) for name in dir(local_pipe): if "tokenizer" in name and not name.startswith("_"): - try: - tok = getattr(local_pipe, name, None) - if tok is None: - continue - # avoid double-wrapping - if isinstance(tok, _TokenizerLockWrapper): - continue - # perform wrap - originals_tok = tok - try: - setattr(local_pipe, name, _TokenizerLockWrapper(originals_tok, self._tokenizer_lock)) - wrapped_tokenizers[name] = originals_tok - except Exception: - logger.debug(f"Failed to wrap tokenizer attribute '{name}' with lock.") - except Exception: - # ignore attribute access errors + tok = getattr(local_pipe, name, None) + if tok is not None: + tokenizer_wrappers[name] = tok + setattr( + local_pipe, + name, + lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + ) + + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): + for key, val in local_pipe.components.items(): + if val is None: continue - - # b) also check components mapping if present (common pattern) - comps = getattr(local_pipe, "components", None) - if isinstance(comps, dict): - for key, val in list(comps.items()): - # only handle values that look like tokenizers - if key and "tokenizer" in str(key).lower(): - try: - if isinstance(val, _TokenizerLockWrapper): - continue - wrapped_name = f"components[{key}]" - local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) - wrapped_tokenizers[wrapped_name] = val - except Exception: - logger.debug(f"Failed to wrap components['{key}'] tokenizer with lock.") - else: - # sometimes tokenizers are stored as values with names that include 'tokenizer' - try: - if hasattr(val, "__class__") and "tokenizer" in val.__class__.__name__.lower(): - wrapped_name = f"components[{key}]" - if isinstance(val, _TokenizerLockWrapper): - continue - local_pipe.components[key] = _TokenizerLockWrapper(val, self._tokenizer_lock) - wrapped_tokenizers[wrapped_name] = val - except Exception: - continue + if "tokenizer" in str(key).lower() or "tokenizer" in val.__class__.__name__.lower(): + 
tokenizer_wrappers[f"components[{key}]"] = val + local_pipe.components[key] = lambda *args, tok=val, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) except Exception as e: logger.debug(f"Tokenizer wrapping step encountered an error: {e}") - # 5) run the pipeline, trying model_cpu_offload_context if available result = None cm = getattr(local_pipe, "model_cpu_offload_context", None) try: @@ -482,22 +415,13 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = return result finally: - # 6) restore any wrapped tokenizers on local_pipe (best-effort, local_pipe will be GC'd) try: - # restore direct attrs - for name, orig in list(wrapped_tokenizers.items()): + for name, tok in tokenizer_wrappers.items(): if name.startswith("components["): - # components entry key = name[len("components["):-1] - try: - local_pipe.components[key] = orig - except Exception: - pass + local_pipe.components[key] = tok else: - try: - setattr(local_pipe, name, orig) - except Exception: - pass + setattr(local_pipe, name, tok) except Exception as e: logger.debug(f"Error restoring wrapped tokenizers: {e}") From b47903911e00121f895460dcaa250ba018cca842 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 16:26:18 -0600 Subject: [PATCH 13/34] Fix tokenizer --- src/diffusers/pipelines/pipeline_utils.py | 41 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 45339e833c78..a36f8c803d3d 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -349,6 +349,24 @@ def _clone_mutable_attrs(self, base, local): logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") continue + def _is_tokenizer_component(self, component) -> bool: + """Determina si un componente es un tokenizador basándose en métodos y atributos comunes.""" + if component is None: + return False + + # Verificar métodos comunes de tokenizadores + tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) + + # Verificar nombre de clase + class_name = component.__class__.__name__.lower() + has_tokenizer_in_name = 'tokenizer' in class_name + + # Verificar atributos comunes de tokenizadores + tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) + + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) @@ -374,21 +392,27 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = for name in dir(local_pipe): if "tokenizer" in name and not name.startswith("_"): tok = getattr(local_pipe, name, None) - if tok is not None: + if tok is not None and self._is_tokenizer_component(tok): tokenizer_wrappers[name] = tok setattr( - local_pipe, - name, - lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + local_pipe, + name, + lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) ) + # b) wrap tokenizers in components dict - CORRECCIÓN CRÍTICA if hasattr(local_pipe, "components") and isinstance(local_pipe.components, 
dict): for key, val in local_pipe.components.items(): if val is None: continue - if "tokenizer" in str(key).lower() or "tokenizer" in val.__class__.__name__.lower(): + + # Solo envolver si realmente ES un tokenizador + if self._is_tokenizer_component(val): tokenizer_wrappers[f"components[{key}]"] = val - local_pipe.components[key] = lambda *args, tok=val, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + # Crear una nueva función lambda que capture correctamente 'val' + local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( + tokenizer, *args, lock=self._tokenizer_lock, **kwargs + ) except Exception as e: logger.debug(f"Tokenizer wrapping step encountered an error: {e}") @@ -409,13 +433,14 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) else: - # no offload context available — call directly + # no offload context available — call directly result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) return result finally: try: + # Restaurar los tokenizadores originales for name, tok in tokenizer_wrappers.items(): if name.startswith("components["): key = name[len("components["):-1] @@ -425,7 +450,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = except Exception as e: logger.debug(f"Error restoring wrapped tokenizers: {e}") - + class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" Base class for all pipelines. From 0beab1cf7841723e36b8b982cfb509d60b659b8c Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 10 Sep 2025 20:59:05 -0600 Subject: [PATCH 14/34] Update examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 25 +-- .../DiffusersServer/create_server.py | 2 +- .../DiffusersServer/serverasync.py | 179 ++++++++++++------ .../DiffusersServer/superpipeline.py | 2 - .../DiffusersServer/uvicorn_diffu.py | 26 +-- examples/server-async/requirements.txt | 3 +- 6 files changed, 123 insertions(+), 114 deletions(-) diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 648f708fd562..66391b89560a 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -1,4 +1,4 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/Pipelines.py +# Pipelines.py from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from diffusers.pipelines.flux.pipeline_flux import FluxPipeline @@ -18,22 +18,12 @@ class TextToImageInput(BaseModel): class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusion3Pipeline = None self.device: str = None def start(self): - """ - Inicia el pipeline cargando el modelo en CUDA o MPS según esté disponible. - Se utiliza la ruta del modelo definida en el __init__ y se asigna un valor predeterminado - en función del dispositivo disponible si no se definió previamente. - """ if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. 
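
A side note on the pattern used in the patches above: the `_TokenizerLockWrapper` class is dropped in favour of a plain `safe_tokenize` helper plus small closures that are swapped in for the pipeline's tokenizer attributes. A minimal, self-contained sketch of that idea (illustrative only — `DummyTokenizer` is a stand-in invented for this example and is not part of the diff):

import threading

def safe_tokenize(tokenizer, *args, lock, **kwargs):
    # serialize every tokenizer call across request threads
    with lock:
        return tokenizer(*args, **kwargs)

class DummyTokenizer:
    def __call__(self, text, **kwargs):
        return {"input_ids": list(range(len(text.split())))}

lock = threading.Lock()
tok = DummyTokenizer()

# the same closure shape that RequestScopedPipeline.generate installs on the local pipe:
wrapped = lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=lock, **kwargs)

print(wrapped("a quick smoke test"))  # {'input_ids': [0, 1, 2, 3]}
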
model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" logger.info("Loading CUDA") self.device = "cuda" @@ -42,7 +32,6 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -55,10 +44,6 @@ def start(self): class TextToImagePipelineFlux: def __init__(self, model_path: str | None = None, low_vram: bool = False): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: FluxPipeline = None self.device: str = None @@ -66,7 +51,6 @@ def __init__(self, model_path: str | None = None, low_vram: bool = False): def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading CUDA") self.device = "cuda" @@ -79,7 +63,6 @@ def start(self): else: pass elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -92,17 +75,12 @@ def start(self): class TextToImagePipelineSD: def __init__(self, model_path: str | None = None): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusionPipeline = None self.device: str = None def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading CUDA") self.device = "cuda" @@ -111,7 +89,6 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. 
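
Taken together, the pipeline_utils changes in this series are meant to let one loaded pipeline serve concurrent requests, each with its own scheduler clone and lock-guarded tokenizer calls. A rough usage sketch under those assumptions (the model id, device, and thread count are arbitrary illustrative choices; `RequestScopedPipeline` is imported from the module this series patches and may not be re-exported at the package top level):

import threading
from diffusers import DiffusionPipeline
from diffusers.pipelines.pipeline_utils import RequestScopedPipeline  # class added by this series

base = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")  # assumed model
base.to("cuda")  # or "mps"/"cpu", depending on the host
request_pipe = RequestScopedPipeline(base)

def worker(prompt: str, idx: int):
    # each call gets a per-request scheduler (clone_for_request, or deepcopy as a fallback)
    out = request_pipe.generate(prompt=prompt, num_inference_steps=28)
    out.images[0].save(f"out_{idx}.png")

threads = [threading.Thread(target=worker, args=(f"a photo of a lighthouse, variant {i}", i)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
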
model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading MPS for Mac M Series") self.device = "mps" diff --git a/examples/server-async/DiffusersServer/create_server.py b/examples/server-async/DiffusersServer/create_server.py index a5e6357db9d7..7ccfd9c742f8 100644 --- a/examples/server-async/DiffusersServer/create_server.py +++ b/examples/server-async/DiffusersServer/create_server.py @@ -1,4 +1,4 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/create_server.py +# create_server.py from .Pipelines import * from .serverasync import * diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index 303f1aa31b3f..78e1d44f4119 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -1,6 +1,4 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/serverasync.py - -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, HTTPException, Request from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware from fastapi.concurrency import run_in_threadpool @@ -22,6 +20,8 @@ from typing import Optional, Dict, Any, Type from dataclasses import dataclass, field from typing import List +from contextlib import asynccontextmanager +import asyncio @dataclass class PresetModels: @@ -114,11 +114,11 @@ def save_video(self, video, fps): @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' + model: str = 'stabilityai/stable-diffusion-3-medium' type_models: str = 't2im' custom_model : bool = False constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None components: Optional[Dict[str, Any]] = None api_name: Optional[str] = 'custom_api' torch_dtype: Optional[torch.dtype] = None @@ -126,7 +126,96 @@ class ServerConfigModels: port: int = 8500 def create_app_fastapi(config: ServerConfigModels) -> FastAPI: - app = FastAPI() + + server_config = config or ServerConfigModels() + + @asynccontextmanager + async def lifespan(app: FastAPI): + logging.basicConfig(level=logging.INFO) + app.state.logger = logging.getLogger("diffusers-server") + + app.state.total_requests = 0 + app.state.active_inferences = 0 + app.state.metrics_lock = asyncio.Lock() + app.state.metrics_task = None + + app.state.utils_app = Utils( + host=server_config.host, + port=server_config.port, + ) + + async def metrics_loop(): + try: + while True: + async with app.state.metrics_lock: + total = app.state.total_requests + active = app.state.active_inferences + app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}") + await asyncio.sleep(5) + except asyncio.CancelledError: + app.state.logger.info("Metrics loop cancelled") + raise + + app.state.metrics_task = asyncio.create_task(metrics_loop()) + + try: + yield + finally: + # 🔻 shutdown + task = app.state.metrics_task + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + try: + stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) + if callable(stop_fn): + await run_in_threadpool(stop_fn) + except Exception as e: + app.state.logger.warning(f"Error during pipeline shutdown: {e}") + + app.state.logger.info("Lifespan shutdown complete") + + + + app = FastAPI(lifespan=lifespan) + + logger = 
logging.getLogger("DiffusersServer.Pipelines") + + if server_config.custom_model: + if server_config.constructor_pipeline is None: + raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required") + + initializer = server_config.constructor_pipeline( + model_path=server_config.model, + pipeline=server_config.custom_pipeline, + torch_dtype=server_config.torch_dtype, + components=server_config.components, + ) + model_pipeline = initializer.start() + request_pipe = None + pipeline_lock = threading.Lock() + + else: + initializer = ModelPipelineInitializer( + model=server_config.model, + type_models=server_config.type_models, + ) + model_pipeline = initializer.initialize_pipeline() + model_pipeline.start() + + request_pipe = RequestScopedPipeline(model_pipeline.pipeline) + pipeline_lock = threading.Lock() + + logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") + + app.state.MODEL_INITIALIZER = initializer + app.state.MODEL_PIPELINE = model_pipeline + app.state.REQUEST_PIPE = request_pipe + app.state.PIPELINE_LOCK = pipeline_lock class JSONBodyQueryAPI(BaseModel): model : str | None = None @@ -135,54 +224,12 @@ class JSONBodyQueryAPI(BaseModel): num_inference_steps : int = 28 num_images_per_prompt : int = 1 - logging.basicConfig(level=logging.INFO) - global logger - logger = logging.getLogger(__name__) - - server_config = config or ServerConfigModels() - app.state.SERVER_CONFIG = server_config - - global utils_app - - utils_app = Utils(host=server_config.host, port=server_config.port) - - logger.info(f"Inicializando pipeline para el modelo: {server_config.model}") - try: - if server_config.custom_model: - if server_config.constructor_pipeline is None: - raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required") - initializer = server_config.constructor_pipeline( - model_path=server_config.model, - pipeline=server_config.custom_pipeline, - torch_dtype=server_config.torch_dtype, - components=server_config.components, - ) - model_pipeline = initializer.start() - app.state.CUSTOM_PIPELINE = server_config.custom_pipeline - app.state.MODEL_PIPELINE = model_pipeline - app.state.MODEL_INITIALIZER = initializer - logger.info(f"Pipeline personalizado inicializado. 
Tipo: {type(model_pipeline)}") - else: - initializer = ModelPipelineInitializer( - model=server_config.model, - type_models=server_config.type_models, - ) - model_pipeline = initializer.initialize_pipeline() - model_pipeline.start() - - app.state.REQUEST_PIPE = RequestScopedPipeline(model_pipeline.pipeline) - - # Lock for concurrency - pipeline_lock = threading.Lock() - - app.state.MODEL_PIPELINE = model_pipeline - app.state.PIPELINE_LOCK = pipeline_lock - app.state.MODEL_INITIALIZER = initializer - - logger.info("Pipeline initialized and ready to receive requests") - except Exception as e: - logger.error(f"Error initializing pipeline: {e}") - raise + @app.middleware("http") + async def count_requests_middleware(request: Request, call_next): + async with app.state.metrics_lock: + app.state.total_requests += 1 + response = await call_next(request) + return response @app.get("/") @@ -196,14 +243,16 @@ async def api(json: JSONBodyQueryAPI): num_steps = json.num_inference_steps num_images_per_prompt = json.num_images_per_prompt - wrapper = app.state.MODEL_PIPELINE + wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER + utils_app = app.state.utils_app + if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Modelo no inicializado correctamente") + raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): - raise HTTPException(400, "No se proporcionó prompt") + raise HTTPException(400, "No prompt provided") def make_generator(): g = torch.Generator(device=initializer.device) @@ -212,9 +261,6 @@ def make_generator(): req_pipe = app.state.REQUEST_PIPE def infer(): - # This is called that because the RequestScoped Pipeline already internally - # handles everything necessary for inference and only the - # model pipeline needs to be passed, for example StableDiffusion3Pipeline gen = make_generator() return req_pipe.generate( prompt=prompt, @@ -226,14 +272,22 @@ def infer(): ) try: + async with app.state.metrics_lock: + app.state.active_inferences += 1 + output = await run_in_threadpool(infer) + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + urls = [utils_app.save_image(img) for img in output.images] return {"response": urls} except Exception as e: - logger.error(f"Error durante la inferencia: {e}") - raise HTTPException(500, f"Error en procesamiento: {e}") + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + logger.error(f"Error during inference: {e}") + raise HTTPException(500, f"Error in processing: {e}") finally: import gc; gc.collect() @@ -243,6 +297,7 @@ def infer(): @app.get("/images/{filename}") async def serve_image(filename: str): + utils_app = app.state.utils_app file_path = os.path.join(utils_app.image_dir, filename) if not os.path.isfile(file_path): raise HTTPException(status_code=404, detail="Image not found") diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py index 394ebac39011..8f5064c1f04a 100644 --- a/examples/server-async/DiffusersServer/superpipeline.py +++ b/examples/server-async/DiffusersServer/superpipeline.py @@ -1,5 +1,3 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/superpipeline.py - from diffusers.pipelines import * from diffusers import * import torch diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py index 
7e19b50f3cbe..437e4961f4d4 100644 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -1,5 +1,3 @@ -# from https://github.com/F4k3r22/DiffusersServer/blob/main/DiffusersServer/uvicorn_diffu.py - import uvicorn import logging import gc @@ -7,7 +5,6 @@ import os import threading import time -import string def setup_logging(): logging.basicConfig(level=logging.INFO) @@ -18,10 +15,8 @@ def setup_logging(): def memory_cleanup(interval=30): while True: try: - gc.collect() - process = psutil.Process(os.getpid()) mem = process.memory_info().rss / 1024 / 1024 logger.info(f"Memoria en uso: {mem:.2f} MB") @@ -43,23 +38,6 @@ def run_uvicorn_server( ], enable_memory_monitor=True ): - """ - Ejecuta un servidor de FastAPI utilizando Uvicorn con monitoreo de memoria opcional - - Args: - app: Aplicación FastAPI - host (str): Host donde se servirá la aplicación - port (int): Puerto para el servidor - workers (int): Número de hilos para Uvicorn - cleanup_interval (int): Intervalo de limpieza para Uvicorn - channel_timeout (int): Tiempo de espera máximo para canales - server_header (bool): Activar el identificador / Header del servidor - headers (str): Identificador del servidor / Header del servidor - enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria - - Returns: - El resultado de serve() (aunque normalmente no retorna) - """ gc.enable() gc.set_threshold(700, 10, 5) @@ -70,9 +48,9 @@ def run_uvicorn_server( daemon=True ) cleanup_thread.start() - logger.info("Monitor de memoria activado") + logger.info("Memory monitor activated") - logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") + logger.info(f"Starting Uvicorn server in {host}:{port}...") config = uvicorn.Config( app=app, diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt index 50eeed9b2f9e..b7a30ef45da8 100644 --- a/examples/server-async/requirements.txt +++ b/examples/server-async/requirements.txt @@ -3,4 +3,5 @@ torchvision transformers sentencepiece fastapi -uvicorn \ No newline at end of file +uvicorn +fifty \ No newline at end of file From 840f0e4a7ab1d1819a6a60df200e48dd91b87d6f Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Thu, 11 Sep 2025 15:20:57 -0600 Subject: [PATCH 15/34] Fix server-async --- .../server-async/DiffusersServer/Pipelines.py | 116 +++++++++++- .../DiffusersServer/serverasync.py | 171 ++++++++++++++---- .../DiffusersServer/superpipeline.py | 8 + .../DiffusersServer/uvicorn_diffu.py | 32 +++- examples/server-async/requirements.txt | 2 +- 5 files changed, 288 insertions(+), 41 deletions(-) diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 66391b89560a..087c4cbd380a 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -7,6 +7,7 @@ import os import logging from pydantic import BaseModel +import gc logger = logging.getLogger(__name__) @@ -19,31 +20,126 @@ class TextToImageInput(BaseModel): class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: StableDiffusion3Pipeline = None - self.device: str = None - + self.pipeline: StableDiffusion3Pipeline | None = None + self.device: str | None = None + def start(self): + torch.set_float32_matmul_precision("high") + + if hasattr(torch._inductor, 'config'): + if hasattr(torch._inductor.config, 'conv_1x1_as_mm'): + 
torch._inductor.config.conv_1x1_as_mm = True + if hasattr(torch._inductor.config, 'coordinate_descent_tuning'): + torch._inductor.config.coordinate_descent_tuning = True + if hasattr(torch._inductor.config, 'epilogue_fusion'): + torch._inductor.config.epilogue_fusion = False + if hasattr(torch._inductor.config, 'coordinate_descent_check_all_directions'): + torch._inductor.config.coordinate_descent_check_all_directions = True + + if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.allow_tf32 = True + + if torch.cuda.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" - logger.info("Loading CUDA") + logger.info(f"Loading CUDA with model: {model_path}") self.device = "cuda" + + torch.cuda.empty_cache() + gc.collect() + self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.float16, - ).to(device=self.device) + use_safetensors=True, + variant="fp16" if "fp16" in model_path else None, + low_cpu_mem_usage=True, + ) + + self.pipeline = self.pipeline.to(device=self.device) + + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: + self.pipeline.transformer = self.pipeline.transformer.to( + memory_format=torch.channels_last + ) + logger.info("Transformer optimized with channels_last format") + + if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: + self.pipeline.vae = self.pipeline.vae.to( + memory_format=torch.channels_last + ) + logger.info("VAE optimized with channels_last format") + + try: + self.pipeline.enable_xformers_memory_efficient_attention() + logger.info("XFormers memory efficient attention enabled") + except Exception as e: + logger.info(f"XFormers not available: {e}") + + # --- Se descarta torch.compile pero se mantiene el resto --- + if torch.__version__ >= "2.0.0": + logger.info("Skipping torch.compile - running without compile optimizations by design") + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + logger.info("CUDA pipeline fully optimized and ready") + elif torch.backends.mps.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" - logger.info("Loading MPS for Mac M Series") + logger.info(f"Loading MPS for Mac M Series with model: {model_path}") self.device = "mps" self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.bfloat16, + use_safetensors=True, + low_cpu_mem_usage=True, ).to(device=self.device) + + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: + self.pipeline.transformer = self.pipeline.transformer.to( + memory_format=torch.channels_last + ) + + if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: + self.pipeline.vae = self.pipeline.vae.to( + memory_format=torch.channels_last + ) + + + logger.info("MPS pipeline optimized and ready") + else: raise Exception("No CUDA or MPS device available") + + # OPTIONAL WARMUP + self._warmup() + + logger.info("Pipeline initialization completed successfully") + + def _warmup(self): + if self.pipeline: + logger.info("Running warmup inference...") + with torch.no_grad(): + _ = self.pipeline( + prompt="warmup", + num_inference_steps=1, + height=512, + width=512, + guidance_scale=1.0, + ) + torch.cuda.empty_cache() if self.device == "cuda" else None + logger.info("Warmup completed") class TextToImagePipelineFlux: def __init__(self, model_path: str | None = 
None, low_vram: bool = False): + """ + Inicialización de la clase con la ruta del modelo. + Si no se proporciona, se obtiene de la variable de entorno. + """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: FluxPipeline = None self.device: str = None @@ -51,6 +147,7 @@ def __init__(self, model_path: str | None = None, low_vram: bool = False): def start(self): if torch.cuda.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading CUDA") self.device = "cuda" @@ -63,6 +160,7 @@ def start(self): else: pass elif torch.backends.mps.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -75,12 +173,17 @@ def start(self): class TextToImagePipelineSD: def __init__(self, model_path: str | None = None): + """ + Inicialización de la clase con la ruta del modelo. + Si no se proporciona, se obtiene de la variable de entorno. + """ self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusionPipeline = None self.device: str = None def start(self): if torch.cuda.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading CUDA") self.device = "cuda" @@ -89,6 +192,7 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): + # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading MPS for Mac M Series") self.device = "mps" diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index 78e1d44f4119..e7e056786c5d 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -1,3 +1,5 @@ +# Voy a mudar todo el servidor a un servidor asincrono con FastAPI y Uvicorn +# Mientras complete esto, el servidor actual sigue funcionando from fastapi import FastAPI, HTTPException, Request from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware @@ -22,6 +24,17 @@ from typing import List from contextlib import asynccontextmanager import asyncio +from PIL import Image + +""" +The goal is to create image generation, editing, and variance endpoints compatible with the OpenAI client. 
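
To make that goal concrete, a hypothetical client call against the JSON generation endpoint this file defines — the route path and host below are assumptions (substitute whatever route `serverasync.py` actually registers for `api()`); the request fields mirror the `JSONBodyQueryAPI` model:

import requests

payload = {
    "prompt": "a watercolor painting of a lighthouse",
    "negative_prompt": "",
    "num_inference_steps": 28,       # default in JSONBodyQueryAPI
    "num_images_per_prompt": 1,
}

# assumed address: ServerConfigModels defaults to host='0.0.0.0', port=8500
resp = requests.post("http://localhost:8500/api/inference", json=payload, timeout=600)  # placeholder route
resp.raise_for_status()
print(resp.json()["response"])       # list of URLs served back via GET /images/{filename}
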
+ +APIs: + +POST /images/variations (create_variation) +POST /images/edits (edit) +POST /images/generations (generate) +""" @dataclass class PresetModels: @@ -80,30 +93,96 @@ def __init__(self, host: str = '0.0.0.0', port: int = 8500): if not os.path.exists(self.video_dir): os.makedirs(self.video_dir) - def save_image(self, image): - if hasattr(image, "to"): - try: - image = image.to("cpu") - except Exception: - pass + def _tensor_to_pil_minimal(self, tensor: torch.Tensor) -> Image.Image: + """ + Convertir tensor GPU->PIL minimizando copias: + - sincroniza GPU + - mueve a CPU non_blocking (requiere pinned memory para ser efectivo) + - hace contiguous una sola vez + - convierte a uint8 una sola vez + """ + # Acepta [N,C,H,W] o [C,H,W] + t = tensor + if t.ndim == 4: + t = t[0] + + # Asegurar que GPU terminó + if t.is_cuda: + torch.cuda.synchronize() + + # Mover a CPU (non_blocking where possible) y hacer contiguous + cpu_t = t.detach().to("cpu", non_blocking=True).contiguous() + + # Normalizar y convertir a uint8. Asumo rango [0,1]. Si tu pipeline devuelve [-1,1] + # usar: cpu_t = (cpu_t + 1) / 2 + cpu_t = cpu_t.clamp(0, 1).mul(255).to(torch.uint8) - if isinstance(image, torch.Tensor): - from torchvision import transforms - to_pil = transforms.ToPILImage() - image = to_pil(image.squeeze(0).clamp(0, 1)) + # reordenar a H,W,C y extraer numpy (una copia inevitable) + arr = cpu_t.permute(1, 2, 0).numpy() + pil = Image.fromarray(arr) + + # cleanup variables intermedias (liberar memoria lo antes posible) + try: + del arr, cpu_t, t + except Exception: + pass + + return pil + + def save_image(self, image): filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" image_path = os.path.join(self.image_dir, filename) logger.info(f"Saving image to {image_path}") - image.save(image_path, format="PNG", optimize=True) + try: + # Si ya es PIL, guardar directo + if isinstance(image, Image.Image): + image.save(image_path, format="PNG", optimize=True) + # liberar referencia + del image + else: + # Si tiene método to (posible tensor o wrapper), intentar mover a cpu primero (seguro) + if hasattr(image, "to") and isinstance(image, torch.Tensor): + # Convertir tensor -> PIL minimizando copias + pil = self._tensor_to_pil_minimal(image) + # Guardar con lock/serialización (see usage in endpoint) + pil.save(image_path, format="PNG", optimize=True) + del pil + else: + # Fallback: si no es tensor ni PIL, intenta convertir via torchvision + try: + from torchvision import transforms + to_pil = transforms.ToPILImage() + pil = to_pil(image.squeeze(0).clamp(0, 1)) + pil.save(image_path, format="PNG", optimize=True) + del pil + except Exception as e: + raise RuntimeError(f"Unsupported image object for saving: {e}") + + # cleanup agresivo + gc.collect() + if torch.cuda.is_available(): + # sincronizar y limpiar caches GPU para evitar buffers retenidos + try: + torch.cuda.synchronize() + except Exception: + pass + torch.cuda.empty_cache() - del image - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() + return os.path.join(self.service_url, "images", filename) - return os.path.join(self.service_url, "images", filename) + except Exception as e: + # intentar limpiar en caso de error + try: + del image + except Exception: + pass + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + logger.error(f"Error saving image: {e}") + raise def save_video(self, video, fps): filename = "video" + str(uuid.uuid4()).split("-")[0] + ".mp4" @@ -114,11 +193,11 @@ def save_video(self, video, fps): 
@dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' - type_models: str = 't2im' + model: str = 'stabilityai/stable-diffusion-3-medium' # Valor predeterminado + type_models: str = 't2im' # Solo hay t2im y t2v custom_model : bool = False constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None # Añadimos valor por defecto components: Optional[Dict[str, Any]] = None api_name: Optional[str] = 'custom_api' torch_dtype: Optional[torch.dtype] = None @@ -139,6 +218,9 @@ async def lifespan(app: FastAPI): app.state.metrics_lock = asyncio.Lock() app.state.metrics_task = None + # Guardar modelo ya inicializado + + # Inicializar utils app.state.utils_app = Utils( host=server_config.host, port=server_config.port, @@ -157,6 +239,9 @@ async def metrics_loop(): raise app.state.metrics_task = asyncio.create_task(metrics_loop()) + from concurrent.futures import ThreadPoolExecutor + + app.state.SAVE_EXECUTOR = ThreadPoolExecutor(max_workers=1) try: yield @@ -170,6 +255,7 @@ async def metrics_loop(): except asyncio.CancelledError: pass + # Intentar liberar pipeline si tiene stop/close try: stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) if callable(stop_fn): @@ -210,7 +296,7 @@ async def metrics_loop(): request_pipe = RequestScopedPipeline(model_pipeline.pipeline) pipeline_lock = threading.Lock() - logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") + logger.info(f"Pipeline inicializado y listo para recibir solicitudes (modelo={server_config.model})") app.state.MODEL_INITIALIZER = initializer app.state.MODEL_PIPELINE = model_pipeline @@ -245,21 +331,18 @@ async def api(json: JSONBodyQueryAPI): wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER - - utils_app = app.state.utils_app - + utils_app = app.state.utils_app + req_pipe = app.state.REQUEST_PIPE if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Model not initialized correctly") + raise HTTPException(500, "Modelo no inicializado correctamente") if not prompt.strip(): - raise HTTPException(400, "No prompt provided") + raise HTTPException(400, "No se proporcionó prompt") def make_generator(): g = torch.Generator(device=initializer.device) return g.manual_seed(random.randint(0, 10_000_000)) - req_pipe = app.state.REQUEST_PIPE - def infer(): gen = make_generator() return req_pipe.generate( @@ -277,20 +360,44 @@ def infer(): output = await run_in_threadpool(infer) + saved_urls = [] + loop = asyncio.get_running_loop() + + images = getattr(output, "images", []) or [] + for idx, img in enumerate(images): + try: + url = await loop.run_in_executor(app.state.SAVE_EXECUTOR, utils_app.save_image, img) + saved_urls.append(url) + except Exception as e: + logger.error(f"Error guardando imagen {idx}: {e}") + finally: + try: + del img + except Exception: + pass + import gc + gc.collect() + if torch.cuda.is_available(): + try: + torch.cuda.synchronize() + except Exception: + pass + torch.cuda.empty_cache() + async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - urls = [utils_app.save_image(img) for img in output.images] - return {"response": urls} + return {"response": saved_urls} except Exception as e: async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - logger.error(f"Error during inference: {e}") - raise HTTPException(500, f"Error 
in processing: {e}") + logger.error(f"Error durante la inferencia: {e}") + raise HTTPException(500, f"Error en procesamiento: {e}") finally: - import gc; gc.collect() + import gc + gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py index 8f5064c1f04a..4e2bb9452c4a 100644 --- a/examples/server-async/DiffusersServer/superpipeline.py +++ b/examples/server-async/DiffusersServer/superpipeline.py @@ -11,6 +11,14 @@ def __init__(self, model_path: str, pipeline: Type, torch_dtype = torch.bfloat16, components: Optional[Dict[str, Any]] = None,): + """ + Clase para crear tus Pipelines personalizados para tu API custom + Args: + model_path: Ruta o nombre del modelo + pipeline: Clase del pipeline a utilizar + torch_dtype: Tipo de datos de PyTorch a utilizar + components: Diccionario de componentes personalizados + """ self.model_path = model_path self.pipeline = pipeline self.torch_dtype = torch_dtype diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py index 437e4961f4d4..faefc5c2f0ee 100644 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -5,7 +5,9 @@ import os import threading import time +import string +# Configuración de logging def setup_logging(): logging.basicConfig(level=logging.INFO) return logging.getLogger('uvicorn') @@ -13,10 +15,18 @@ def setup_logging(): logger = setup_logging() def memory_cleanup(interval=30): + """ + Función para monitorear y limpiar la memoria periódicamente + + Args: + interval (int): Intervalo en segundos entre limpiezas + """ while True: try: + # Forzar recolección de basura gc.collect() + # Obtener información de memoria actual process = psutil.Process(os.getpid()) mem = process.memory_info().rss / 1024 / 1024 logger.info(f"Memoria en uso: {mem:.2f} MB") @@ -38,9 +48,27 @@ def run_uvicorn_server( ], enable_memory_monitor=True ): + """ + Ejecuta un servidor de FastAPI utilizando Uvicorn con monitoreo de memoria opcional + + Args: + app: Aplicación FastAPI + host (str): Host donde se servirá la aplicación + port (int): Puerto para el servidor + workers (int): Número de hilos para Uvicorn + cleanup_interval (int): Intervalo de limpieza para Uvicorn + channel_timeout (int): Tiempo de espera máximo para canales + server_header (bool): Activar el identificador / Header del servidor + headers (str): Identificador del servidor / Header del servidor + enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria + + Returns: + El resultado de serve() (aunque normalmente no retorna) + """ gc.enable() gc.set_threshold(700, 10, 5) + # Iniciar monitoreo de memoria si está habilitado if enable_memory_monitor: cleanup_thread = threading.Thread( target=memory_cleanup, @@ -48,9 +76,9 @@ def run_uvicorn_server( daemon=True ) cleanup_thread.start() - logger.info("Memory monitor activated") + logger.info("Monitor de memoria activado") - logger.info(f"Starting Uvicorn server in {host}:{port}...") + logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") config = uvicorn.Config( app=app, diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt index b7a30ef45da8..d5a3746c235b 100644 --- a/examples/server-async/requirements.txt +++ b/examples/server-async/requirements.txt @@ -4,4 +4,4 @@ transformers sentencepiece fastapi uvicorn -fifty \ No newline 
at end of file +ftfy \ No newline at end of file From ed617fe154e2adf996be77dd6eef86f0d74e4a02 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Fri, 12 Sep 2025 21:07:19 -0600 Subject: [PATCH 16/34] Optimizations in examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 41 ++- .../DiffusersServer/serverasync.py | 290 +++++++++--------- examples/server-async/requirements.txt | 5 +- 3 files changed, 178 insertions(+), 158 deletions(-) diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 087c4cbd380a..60be11b2f241 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -42,7 +42,6 @@ def start(self): torch.backends.cudnn.deterministic = False torch.backends.cudnn.allow_tf32 = True - if torch.cuda.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" logger.info(f"Loading CUDA with model: {model_path}") @@ -61,6 +60,14 @@ def start(self): self.pipeline = self.pipeline.to(device=self.device) + if hasattr(self.pipeline, 'enable_vae_slicing'): + self.pipeline.enable_vae_slicing() + logger.info("VAE slicing enabled - will reduce memory spikes during decoding") + + if hasattr(self.pipeline, 'enable_vae_tiling'): + self.pipeline.enable_vae_tiling() + logger.info("VAE tiling enabled - will allow processing larger images") + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: self.pipeline.transformer = self.pipeline.transformer.to( memory_format=torch.channels_last @@ -71,6 +78,15 @@ def start(self): self.pipeline.vae = self.pipeline.vae.to( memory_format=torch.channels_last ) + + if hasattr(self.pipeline.vae, 'enable_slicing'): + self.pipeline.vae.enable_slicing() + logger.info("VAE slicing activated directly in the VAE") + + if hasattr(self.pipeline.vae, 'enable_tiling'): + self.pipeline.vae.enable_tiling() + logger.info("VAE tiling activated directly on the VAE") + logger.info("VAE optimized with channels_last format") try: @@ -79,9 +95,7 @@ def start(self): except Exception as e: logger.info(f"XFormers not available: {e}") - # --- Se descarta torch.compile pero se mantiene el resto --- - if torch.__version__ >= "2.0.0": - logger.info("Skipping torch.compile - running without compile optimizations by design") + logger.info("Skipping torch.compile - running without compile optimizations by design") if torch.cuda.is_available(): torch.cuda.empty_cache() @@ -92,13 +106,18 @@ def start(self): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" logger.info(f"Loading MPS for Mac M Series with model: {model_path}") self.device = "mps" + self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.bfloat16, use_safetensors=True, low_cpu_mem_usage=True, ).to(device=self.device) - + + if hasattr(self.pipeline, 'enable_vae_slicing'): + self.pipeline.enable_vae_slicing() + logger.info("VAE slicing enabled in MPS") + if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: self.pipeline.transformer = self.pipeline.transformer.to( memory_format=torch.channels_last @@ -108,14 +127,13 @@ def start(self): self.pipeline.vae = self.pipeline.vae.to( memory_format=torch.channels_last ) - logger.info("MPS pipeline optimized and ready") else: raise Exception("No CUDA or MPS device available") - # OPTIONAL WARMUP + self._warmup() logger.info("Pipeline initialization completed successfully") @@ -131,8 +149,13 @@ def _warmup(self): 
width=512, guidance_scale=1.0, ) - torch.cuda.empty_cache() if self.device == "cuda" else None - logger.info("Warmup completed") + + if self.device == "cuda": + torch.cuda.synchronize() + torch.cuda.empty_cache() + + gc.collect() + logger.info("Warmup completed with memory cleanup") class TextToImagePipelineFlux: def __init__(self, model_path: str | None = None, low_vram: bool = False): diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index e7e056786c5d..a3392500d9f6 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -1,5 +1,3 @@ -# Voy a mudar todo el servidor a un servidor asincrono con FastAPI y Uvicorn -# Mientras complete esto, el servidor actual sigue funcionando from fastapi import FastAPI, HTTPException, Request from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware @@ -26,16 +24,6 @@ import asyncio from PIL import Image -""" -The goal is to create image generation, editing, and variance endpoints compatible with the OpenAI client. - -APIs: - -POST /images/variations (create_variation) -POST /images/edits (edit) -POST /images/generations (generate) -""" - @dataclass class PresetModels: SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) @@ -93,111 +81,114 @@ def __init__(self, host: str = '0.0.0.0', port: int = 8500): if not os.path.exists(self.video_dir): os.makedirs(self.video_dir) - def _tensor_to_pil_minimal(self, tensor: torch.Tensor) -> Image.Image: - """ - Convertir tensor GPU->PIL minimizando copias: - - sincroniza GPU - - mueve a CPU non_blocking (requiere pinned memory para ser efectivo) - - hace contiguous una sola vez - - convierte a uint8 una sola vez - """ - # Acepta [N,C,H,W] o [C,H,W] - t = tensor - if t.ndim == 4: - t = t[0] - - # Asegurar que GPU terminó - if t.is_cuda: - torch.cuda.synchronize() - - # Mover a CPU (non_blocking where possible) y hacer contiguous - cpu_t = t.detach().to("cpu", non_blocking=True).contiguous() - - # Normalizar y convertir a uint8. Asumo rango [0,1]. 
Si tu pipeline devuelve [-1,1] - # usar: cpu_t = (cpu_t + 1) / 2 - cpu_t = cpu_t.clamp(0, 1).mul(255).to(torch.uint8) - - # reordenar a H,W,C y extraer numpy (una copia inevitable) - arr = cpu_t.permute(1, 2, 0).numpy() - - pil = Image.fromarray(arr) + from concurrent.futures import ThreadPoolExecutor + self.executor = ThreadPoolExecutor(max_workers=2) - # cleanup variables intermedias (liberar memoria lo antes posible) + def _save_pil_image(self, pil_image: Image.Image, filepath: str): try: - del arr, cpu_t, t - except Exception: - pass - - return pil - - def save_image(self, image): - filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" - image_path = os.path.join(self.image_dir, filename) - logger.info(f"Saving image to {image_path}") - + pil_image.save(filepath, format="PNG", optimize=True, compress_level=6) + except Exception as e: + logger.error(f"Error saving PIL image: {e}") + raise + finally: + if pil_image: + pil_image.close() + del pil_image + + def _tensor_to_pil_optimized(self, tensor: torch.Tensor) -> Image.Image: + + with torch.no_grad(): + tensor_cpu = tensor.detach().clone() + + if tensor_cpu.is_cuda: + tensor_cpu = tensor_cpu.cpu() + torch.cuda.synchronize() + + if tensor_cpu.dim() == 4: + tensor_cpu = tensor_cpu[0] + + tensor_cpu = tensor_cpu.clamp(0, 1).mul(255).byte() + + if tensor_cpu.shape[0] in [1, 3, 4]: + tensor_cpu = tensor_cpu.permute(1, 2, 0) + + np_array = tensor_cpu.contiguous().numpy() + + del tensor_cpu + + if np_array.shape[-1] == 1: + np_array = np_array.squeeze(-1) + mode = 'L' + elif np_array.shape[-1] == 3: + mode = 'RGB' + elif np_array.shape[-1] == 4: + mode = 'RGBA' + else: + raise ValueError(f"Unsupported number of channels: {np_array.shape[-1]}") + + pil_image = Image.fromarray(np_array, mode=mode) + + del np_array + + return pil_image + + async def save_image(self, image) -> str: + + image_id = str(uuid.uuid4()).split("-")[0] + filename = f"img{image_id}.png" + filepath = os.path.join(self.image_dir, filename) + url = os.path.join(self.service_url, "images", filename) + + loop = asyncio.get_event_loop() + try: - # Si ya es PIL, guardar directo if isinstance(image, Image.Image): - image.save(image_path, format="PNG", optimize=True) - # liberar referencia - del image + await loop.run_in_executor( + self.executor, + self._save_pil_image, + image, + filepath + ) + + elif isinstance(image, torch.Tensor): + with torch.no_grad(): + pil_image = await loop.run_in_executor( + None, + self._tensor_to_pil_optimized, + image + ) + + await loop.run_in_executor( + self.executor, + self._save_pil_image, + pil_image, + filepath + ) + + del pil_image + else: - # Si tiene método to (posible tensor o wrapper), intentar mover a cpu primero (seguro) - if hasattr(image, "to") and isinstance(image, torch.Tensor): - # Convertir tensor -> PIL minimizando copias - pil = self._tensor_to_pil_minimal(image) - # Guardar con lock/serialización (see usage in endpoint) - pil.save(image_path, format="PNG", optimize=True) - del pil - else: - # Fallback: si no es tensor ni PIL, intenta convertir via torchvision - try: - from torchvision import transforms - to_pil = transforms.ToPILImage() - pil = to_pil(image.squeeze(0).clamp(0, 1)) - pil.save(image_path, format="PNG", optimize=True) - del pil - except Exception as e: - raise RuntimeError(f"Unsupported image object for saving: {e}") - - # cleanup agresivo - gc.collect() - if torch.cuda.is_available(): - # sincronizar y limpiar caches GPU para evitar buffers retenidos - try: - torch.cuda.synchronize() - except Exception: - 
pass - torch.cuda.empty_cache() - - return os.path.join(self.service_url, "images", filename) - + raise ValueError(f"Unsupported image type: {type(image)}") + + logger.debug(f"Image saved: {filename}") + return url + except Exception as e: - # intentar limpiar en caso de error - try: - del image - except Exception: - pass - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - logger.error(f"Error saving image: {e}") + logger.error(f"Error in save_image_optimized: {e}") raise - - def save_video(self, video, fps): - filename = "video" + str(uuid.uuid4()).split("-")[0] + ".mp4" - video_path = os.path.join(self.video_dir, filename) - export = export_to_video(video, video_path, fps=fps) - logger.info(f"Saving video to {video_path}") - return os.path.join(self.service_url, "video", filename) + finally: + gc.collect() + + def shutdown(self): + self.executor.shutdown(wait=True) @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' # Valor predeterminado - type_models: str = 't2im' # Solo hay t2im y t2v + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' custom_model : bool = False constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None # Añadimos valor por defecto + custom_pipeline: Optional[Type] = None components: Optional[Dict[str, Any]] = None api_name: Optional[str] = 'custom_api' torch_dtype: Optional[torch.dtype] = None @@ -218,9 +209,6 @@ async def lifespan(app: FastAPI): app.state.metrics_lock = asyncio.Lock() app.state.metrics_task = None - # Guardar modelo ya inicializado - - # Inicializar utils app.state.utils_app = Utils( host=server_config.host, port=server_config.port, @@ -240,13 +228,11 @@ async def metrics_loop(): app.state.metrics_task = asyncio.create_task(metrics_loop()) from concurrent.futures import ThreadPoolExecutor - app.state.SAVE_EXECUTOR = ThreadPoolExecutor(max_workers=1) try: yield finally: - # 🔻 shutdown task = app.state.metrics_task if task: task.cancel() @@ -255,7 +241,6 @@ async def metrics_loop(): except asyncio.CancelledError: pass - # Intentar liberar pipeline si tiene stop/close try: stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) if callable(stop_fn): @@ -265,8 +250,6 @@ async def metrics_loop(): app.state.logger.info("Lifespan shutdown complete") - - app = FastAPI(lifespan=lifespan) logger = logging.getLogger("DiffusersServer.Pipelines") @@ -296,7 +279,7 @@ async def metrics_loop(): request_pipe = RequestScopedPipeline(model_pipeline.pipeline) pipeline_lock = threading.Lock() - logger.info(f"Pipeline inicializado y listo para recibir solicitudes (modelo={server_config.model})") + logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") app.state.MODEL_INITIALIZER = initializer app.state.MODEL_PIPELINE = model_pipeline @@ -335,9 +318,9 @@ async def api(json: JSONBodyQueryAPI): req_pipe = app.state.REQUEST_PIPE if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Modelo no inicializado correctamente") + raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): - raise HTTPException(400, "No se proporcionó prompt") + raise HTTPException(400, "No prompt provided") def make_generator(): g = torch.Generator(device=initializer.device) @@ -345,45 +328,56 @@ def make_generator(): def infer(): gen = make_generator() - return req_pipe.generate( - prompt=prompt, - negative_prompt=negative_prompt, - generator=gen, - 
num_inference_steps=num_steps, - num_images_per_prompt=num_images_per_prompt, - device=initializer.device - ) + + # Maybe this will improve some performance (I'll test it) + with torch.no_grad(): + output = req_pipe.generate( + prompt=prompt, + negative_prompt=negative_prompt, + generator=gen, + num_inference_steps=num_steps, + num_images_per_prompt=num_images_per_prompt, + device=initializer.device + ) + + return output try: async with app.state.metrics_lock: app.state.active_inferences += 1 output = await run_in_threadpool(infer) - - saved_urls = [] - loop = asyncio.get_running_loop() - + images = getattr(output, "images", []) or [] - for idx, img in enumerate(images): + + saved_urls = [] + + for i, img in enumerate(images): try: - url = await loop.run_in_executor(app.state.SAVE_EXECUTOR, utils_app.save_image, img) + + url = await utils_app.save_image(img) saved_urls.append(url) - except Exception as e: - logger.error(f"Error guardando imagen {idx}: {e}") - finally: - try: - del img - except Exception: - pass - import gc - gc.collect() + + if isinstance(img, Image.Image): + img.close() + del img + if torch.cuda.is_available(): - try: - torch.cuda.synchronize() - except Exception: - pass - torch.cuda.empty_cache() + torch.cuda.synchronize() + + except Exception as e: + logger.error(f"Error saving image {i}: {e}") + continue + + del output, images + + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + + gc.collect() + async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) @@ -392,14 +386,14 @@ def infer(): except Exception as e: async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - logger.error(f"Error durante la inferencia: {e}") - raise HTTPException(500, f"Error en procesamiento: {e}") + logger.error(f"Error during inference: {e}") + raise HTTPException(500, f"Error in processing: {e}") finally: - import gc - gc.collect() if torch.cuda.is_available(): + torch.cuda.synchronize() torch.cuda.empty_cache() + gc.collect() @app.get("/images/{filename}") diff --git a/examples/server-async/requirements.txt b/examples/server-async/requirements.txt index d5a3746c235b..aafa93b7023f 100644 --- a/examples/server-async/requirements.txt +++ b/examples/server-async/requirements.txt @@ -4,4 +4,7 @@ transformers sentencepiece fastapi uvicorn -ftfy \ No newline at end of file +ftfy +accelerate +xformers +protobuf \ No newline at end of file From b052d27fd7390d30a79d51f9a8b55b62d154b36e Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 13 Sep 2025 22:04:14 -0600 Subject: [PATCH 17/34] We keep the implementation simple in examples/server-async --- .../server-async/DiffusersServer/Pipelines.py | 146 +------------ .../server-async/DiffusersServer/__init__.py | 1 - .../DiffusersServer/serverasync.py | 195 ++++-------------- .../DiffusersServer/superpipeline.py | 50 ----- .../DiffusersServer/uvicorn_diffu.py | 36 +--- 5 files changed, 58 insertions(+), 370 deletions(-) delete mode 100644 examples/server-async/DiffusersServer/superpipeline.py diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/DiffusersServer/Pipelines.py index 60be11b2f241..bc60d4811c3e 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/DiffusersServer/Pipelines.py @@ -1,5 +1,4 @@ # Pipelines.py - from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from 
diffusers.pipelines.flux.pipeline_flux import FluxPipeline from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline @@ -7,7 +6,6 @@ import os import logging from pydantic import BaseModel -import gc logger = logging.getLogger(__name__) @@ -22,155 +20,36 @@ def __init__(self, model_path: str | None = None): self.model_path = model_path or os.getenv("MODEL_PATH") self.pipeline: StableDiffusion3Pipeline | None = None self.device: str | None = None - + def start(self): - torch.set_float32_matmul_precision("high") - - if hasattr(torch._inductor, 'config'): - if hasattr(torch._inductor.config, 'conv_1x1_as_mm'): - torch._inductor.config.conv_1x1_as_mm = True - if hasattr(torch._inductor.config, 'coordinate_descent_tuning'): - torch._inductor.config.coordinate_descent_tuning = True - if hasattr(torch._inductor.config, 'epilogue_fusion'): - torch._inductor.config.epilogue_fusion = False - if hasattr(torch._inductor.config, 'coordinate_descent_check_all_directions'): - torch._inductor.config.coordinate_descent_check_all_directions = True - - if torch.cuda.is_available(): - torch.backends.cudnn.benchmark = True - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.deterministic = False - torch.backends.cudnn.allow_tf32 = True - if torch.cuda.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large" - logger.info(f"Loading CUDA with model: {model_path}") + logger.info("Loading CUDA") self.device = "cuda" - - torch.cuda.empty_cache() - gc.collect() - self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.float16, - use_safetensors=True, - variant="fp16" if "fp16" in model_path else None, - low_cpu_mem_usage=True, - ) - - self.pipeline = self.pipeline.to(device=self.device) - - if hasattr(self.pipeline, 'enable_vae_slicing'): - self.pipeline.enable_vae_slicing() - logger.info("VAE slicing enabled - will reduce memory spikes during decoding") - - if hasattr(self.pipeline, 'enable_vae_tiling'): - self.pipeline.enable_vae_tiling() - logger.info("VAE tiling enabled - will allow processing larger images") - - if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: - self.pipeline.transformer = self.pipeline.transformer.to( - memory_format=torch.channels_last - ) - logger.info("Transformer optimized with channels_last format") - - if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: - self.pipeline.vae = self.pipeline.vae.to( - memory_format=torch.channels_last - ) - - if hasattr(self.pipeline.vae, 'enable_slicing'): - self.pipeline.vae.enable_slicing() - logger.info("VAE slicing activated directly in the VAE") - - if hasattr(self.pipeline.vae, 'enable_tiling'): - self.pipeline.vae.enable_tiling() - logger.info("VAE tiling activated directly on the VAE") - - logger.info("VAE optimized with channels_last format") - - try: - self.pipeline.enable_xformers_memory_efficient_attention() - logger.info("XFormers memory efficient attention enabled") - except Exception as e: - logger.info(f"XFormers not available: {e}") - - logger.info("Skipping torch.compile - running without compile optimizations by design") - - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - logger.info("CUDA pipeline fully optimized and ready") - + ).to(device=self.device) elif torch.backends.mps.is_available(): model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium" - logger.info(f"Loading MPS for Mac M Series with model: {model_path}") + 
logger.info("Loading MPS for Mac M Series") self.device = "mps" - self.pipeline = StableDiffusion3Pipeline.from_pretrained( model_path, torch_dtype=torch.bfloat16, - use_safetensors=True, - low_cpu_mem_usage=True, ).to(device=self.device) - - if hasattr(self.pipeline, 'enable_vae_slicing'): - self.pipeline.enable_vae_slicing() - logger.info("VAE slicing enabled in MPS") - - if hasattr(self.pipeline, 'transformer') and self.pipeline.transformer is not None: - self.pipeline.transformer = self.pipeline.transformer.to( - memory_format=torch.channels_last - ) - - if hasattr(self.pipeline, 'vae') and self.pipeline.vae is not None: - self.pipeline.vae = self.pipeline.vae.to( - memory_format=torch.channels_last - ) - - logger.info("MPS pipeline optimized and ready") - else: raise Exception("No CUDA or MPS device available") - - - self._warmup() - - logger.info("Pipeline initialization completed successfully") - - def _warmup(self): - if self.pipeline: - logger.info("Running warmup inference...") - with torch.no_grad(): - _ = self.pipeline( - prompt="warmup", - num_inference_steps=1, - height=512, - width=512, - guidance_scale=1.0, - ) - - if self.device == "cuda": - torch.cuda.synchronize() - torch.cuda.empty_cache() - - gc.collect() - logger.info("Warmup completed with memory cleanup") class TextToImagePipelineFlux: def __init__(self, model_path: str | None = None, low_vram: bool = False): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: FluxPipeline = None - self.device: str = None + self.pipeline: FluxPipeline | None = None + self.device: str | None = None self.low_vram = low_vram def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading CUDA") self.device = "cuda" @@ -183,7 +62,6 @@ def start(self): else: pass elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -196,17 +74,12 @@ def start(self): class TextToImagePipelineSD: def __init__(self, model_path: str | None = None): - """ - Inicialización de la clase con la ruta del modelo. - Si no se proporciona, se obtiene de la variable de entorno. - """ self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: StableDiffusionPipeline = None - self.device: str = None + self.pipeline: StableDiffusionPipeline | None = None + self.device: str | None = None def start(self): if torch.cuda.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para CUDA. model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading CUDA") self.device = "cuda" @@ -215,7 +88,6 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) elif torch.backends.mps.is_available(): - # Si no se definió model_path, se asigna el valor por defecto para MPS. 
model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" logger.info("Loading MPS for Mac M Series") self.device = "mps" @@ -224,4 +96,4 @@ def start(self): torch_dtype=torch.float16, ).to(device=self.device) else: - raise Exception("No CUDA or MPS device available") \ No newline at end of file + raise Exception("No CUDA or MPS device available") diff --git a/examples/server-async/DiffusersServer/__init__.py b/examples/server-async/DiffusersServer/__init__.py index d4dc75b71a1f..0d8d5761a939 100644 --- a/examples/server-async/DiffusersServer/__init__.py +++ b/examples/server-async/DiffusersServer/__init__.py @@ -1,3 +1,2 @@ from .Pipelines import TextToImagePipelineSD3 -from .superpipeline import SuperPipelinesT2Img from .create_server import create_inference_server_Async as DiffusersServerApp \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index a3392500d9f6..ff0e64080d81 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -5,7 +5,6 @@ from pydantic import BaseModel from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD import logging -from diffusers.utils.export_utils import export_to_video from diffusers.pipelines.pipeline_utils import RequestScopedPipeline from diffusers import * from .superpipeline import * @@ -22,7 +21,6 @@ from typing import List from contextlib import asynccontextmanager import asyncio -from PIL import Image @dataclass class PresetModels: @@ -81,106 +79,30 @@ def __init__(self, host: str = '0.0.0.0', port: int = 8500): if not os.path.exists(self.video_dir): os.makedirs(self.video_dir) - from concurrent.futures import ThreadPoolExecutor - self.executor = ThreadPoolExecutor(max_workers=2) + def save_image(self, image): + if hasattr(image, "to"): + try: + image = image.to("cpu") + except Exception: + pass - def _save_pil_image(self, pil_image: Image.Image, filepath: str): - try: - pil_image.save(filepath, format="PNG", optimize=True, compress_level=6) - except Exception as e: - logger.error(f"Error saving PIL image: {e}") - raise - finally: - if pil_image: - pil_image.close() - del pil_image - - def _tensor_to_pil_optimized(self, tensor: torch.Tensor) -> Image.Image: - - with torch.no_grad(): - tensor_cpu = tensor.detach().clone() - - if tensor_cpu.is_cuda: - tensor_cpu = tensor_cpu.cpu() - torch.cuda.synchronize() - - if tensor_cpu.dim() == 4: - tensor_cpu = tensor_cpu[0] - - tensor_cpu = tensor_cpu.clamp(0, 1).mul(255).byte() - - if tensor_cpu.shape[0] in [1, 3, 4]: - tensor_cpu = tensor_cpu.permute(1, 2, 0) - - np_array = tensor_cpu.contiguous().numpy() - - del tensor_cpu - - if np_array.shape[-1] == 1: - np_array = np_array.squeeze(-1) - mode = 'L' - elif np_array.shape[-1] == 3: - mode = 'RGB' - elif np_array.shape[-1] == 4: - mode = 'RGBA' - else: - raise ValueError(f"Unsupported number of channels: {np_array.shape[-1]}") - - pil_image = Image.fromarray(np_array, mode=mode) - - del np_array - - return pil_image - - async def save_image(self, image) -> str: - - image_id = str(uuid.uuid4()).split("-")[0] - filename = f"img{image_id}.png" - filepath = os.path.join(self.image_dir, filename) - url = os.path.join(self.service_url, "images", filename) - - loop = asyncio.get_event_loop() - - try: - if isinstance(image, Image.Image): - await loop.run_in_executor( - self.executor, - self._save_pil_image, - image, - filepath - ) - - elif 
isinstance(image, torch.Tensor): - with torch.no_grad(): - pil_image = await loop.run_in_executor( - None, - self._tensor_to_pil_optimized, - image - ) - - await loop.run_in_executor( - self.executor, - self._save_pil_image, - pil_image, - filepath - ) - - del pil_image - - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - logger.debug(f"Image saved: {filename}") - return url - - except Exception as e: - logger.error(f"Error in save_image_optimized: {e}") - raise - finally: - gc.collect() - - def shutdown(self): - self.executor.shutdown(wait=True) + if isinstance(image, torch.Tensor): + from torchvision import transforms + to_pil = transforms.ToPILImage() + image = to_pil(image.squeeze(0).clamp(0, 1)) + + filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" + image_path = os.path.join(self.image_dir, filename) + logger.info(f"Saving image to {image_path}") + + image.save(image_path, format="PNG", optimize=True) + + del image + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return os.path.join(self.service_url, "images", filename) @dataclass class ServerConfigModels: @@ -203,6 +125,8 @@ def create_app_fastapi(config: ServerConfigModels) -> FastAPI: async def lifespan(app: FastAPI): logging.basicConfig(level=logging.INFO) app.state.logger = logging.getLogger("diffusers-server") + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' + os.environ['CUDA_LAUNCH_BLOCKING'] = '0' app.state.total_requests = 0 app.state.active_inferences = 0 @@ -227,8 +151,6 @@ async def metrics_loop(): raise app.state.metrics_task = asyncio.create_task(metrics_loop()) - from concurrent.futures import ThreadPoolExecutor - app.state.SAVE_EXECUTOR = ThreadPoolExecutor(max_workers=1) try: yield @@ -314,74 +236,45 @@ async def api(json: JSONBodyQueryAPI): wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER - utils_app = app.state.utils_app - req_pipe = app.state.REQUEST_PIPE + + utils_app = app.state.utils_app + if not wrapper or not wrapper.pipeline: raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): raise HTTPException(400, "No prompt provided") + def make_generator(): g = torch.Generator(device=initializer.device) return g.manual_seed(random.randint(0, 10_000_000)) + req_pipe = app.state.REQUEST_PIPE + def infer(): gen = make_generator() - - # Maybe this will improve some performance (I'll test it) - with torch.no_grad(): - output = req_pipe.generate( - prompt=prompt, - negative_prompt=negative_prompt, - generator=gen, - num_inference_steps=num_steps, - num_images_per_prompt=num_images_per_prompt, - device=initializer.device - ) - - return output + return req_pipe.generate( + prompt=prompt, + negative_prompt=negative_prompt, + generator=gen, + num_inference_steps=num_steps, + num_images_per_prompt=num_images_per_prompt, + device=initializer.device, + output_type="pil", + ) try: async with app.state.metrics_lock: app.state.active_inferences += 1 output = await run_in_threadpool(infer) - - images = getattr(output, "images", []) or [] - - saved_urls = [] - - for i, img in enumerate(images): - try: - url = await utils_app.save_image(img) - saved_urls.append(url) - - if isinstance(img, Image.Image): - img.close() - del img - - if torch.cuda.is_available(): - torch.cuda.synchronize() - - except Exception as e: - logger.error(f"Error saving image {i}: {e}") - continue - - - del output, images - - if torch.cuda.is_available(): - torch.cuda.synchronize() - 
torch.cuda.empty_cache() - - gc.collect() - async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - - return {"response": saved_urls} + + urls = [utils_app.save_image(img) for img in output.images] + return {"response": urls} except Exception as e: async with app.state.metrics_lock: @@ -393,6 +286,8 @@ def infer(): if torch.cuda.is_available(): torch.cuda.synchronize() torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.ipc_collect() gc.collect() diff --git a/examples/server-async/DiffusersServer/superpipeline.py b/examples/server-async/DiffusersServer/superpipeline.py deleted file mode 100644 index 4e2bb9452c4a..000000000000 --- a/examples/server-async/DiffusersServer/superpipeline.py +++ /dev/null @@ -1,50 +0,0 @@ -from diffusers.pipelines import * -from diffusers import * -import torch -from typing import Optional, Dict, Any, Type -import logging - -logger = logging.getLogger(__name__) - -class SuperPipelinesT2Img: - def __init__(self, model_path: str, - pipeline: Type, - torch_dtype = torch.bfloat16, - components: Optional[Dict[str, Any]] = None,): - """ - Clase para crear tus Pipelines personalizados para tu API custom - Args: - model_path: Ruta o nombre del modelo - pipeline: Clase del pipeline a utilizar - torch_dtype: Tipo de datos de PyTorch a utilizar - components: Diccionario de componentes personalizados - """ - self.model_path = model_path - self.pipeline = pipeline - self.torch_dtype = torch_dtype - self.components = components or {} - self.device: str = None - - def start(self): - if torch.cuda.is_available(): - logger.info("Loading CUDA") - model_path = self.model_path - self.device = 'cuda' - self.pipeline = self.pipeline.from_pretrained( - model_path, - torch_dtype = self.torch_dtype, - ** self.components - ).to(device=self.device) - elif torch.backends.mps.is_available(): - logger.info("Loading MPS for Mac M Series") - model_path = self.model_path - self.device = 'mps' - self.pipeline = self.pipeline.from_pretrained( - model_path, - torch_dtype = self.torch_dtype, - **self.components - ).to(device=self.device) - else: - raise Exception("No CUDA or MPS device available") - - return self \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py index faefc5c2f0ee..c2688e25497d 100644 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ b/examples/server-async/DiffusersServer/uvicorn_diffu.py @@ -5,9 +5,7 @@ import os import threading import time -import string -# Configuración de logging def setup_logging(): logging.basicConfig(level=logging.INFO) return logging.getLogger('uvicorn') @@ -15,25 +13,17 @@ def setup_logging(): logger = setup_logging() def memory_cleanup(interval=30): - """ - Función para monitorear y limpiar la memoria periódicamente - - Args: - interval (int): Intervalo en segundos entre limpiezas - """ while True: try: - # Forzar recolección de basura gc.collect() - # Obtener información de memoria actual process = psutil.Process(os.getpid()) mem = process.memory_info().rss / 1024 / 1024 - logger.info(f"Memoria en uso: {mem:.2f} MB") + logger.info(f"Memory in use: {mem:.2f} MB") time.sleep(interval) except Exception as e: - logger.error(f"Error en limpieza de memoria: {str(e)}") + logger.error(f"Memory clearing error: {str(e)}") time.sleep(interval) def run_uvicorn_server( @@ -48,27 +38,9 @@ def run_uvicorn_server( ], enable_memory_monitor=True ): - """ - Ejecuta un servidor de 
FastAPI utilizando Uvicorn con monitoreo de memoria opcional - - Args: - app: Aplicación FastAPI - host (str): Host donde se servirá la aplicación - port (int): Puerto para el servidor - workers (int): Número de hilos para Uvicorn - cleanup_interval (int): Intervalo de limpieza para Uvicorn - channel_timeout (int): Tiempo de espera máximo para canales - server_header (bool): Activar el identificador / Header del servidor - headers (str): Identificador del servidor / Header del servidor - enable_memory_monitor (bool): Si se debe activar el monitoreo de memoria - - Returns: - El resultado de serve() (aunque normalmente no retorna) - """ gc.enable() gc.set_threshold(700, 10, 5) - # Iniciar monitoreo de memoria si está habilitado if enable_memory_monitor: cleanup_thread = threading.Thread( target=memory_cleanup, @@ -76,9 +48,9 @@ def run_uvicorn_server( daemon=True ) cleanup_thread.start() - logger.info("Monitor de memoria activado") + logger.info("Memory monitor activated") - logger.info(f"Iniciando servidor Uvicorn en {host}:{port}...") + logger.info(f"Starting Uvicorn server in {host}:{port}...") config = uvicorn.Config( app=app, From 0f63f4d4362a769e082a37e0a8e344eb427985ab Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 13 Sep 2025 22:06:24 -0600 Subject: [PATCH 18/34] Update examples/server-async/README.md --- examples/server-async/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 10b4c1825098..43b86d52442d 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,6 +1,7 @@ # Asynchronous server and parallel execution of models > Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. 
+> We recommend running about 10 to 50 inferences in parallel to have a good performance of 25-30s to 1-1:30min on average ## ⚠️ IMPORTANT From a9666b11fca95b24318893054bd384c37cc3126d Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sat, 13 Sep 2025 22:16:42 -0600 Subject: [PATCH 19/34] Update examples/server-async/README.md for changes to tokenizer locks and backward-compatible retrieve_timesteps --- examples/server-async/README.md | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 43b86d52442d..edf07852c247 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -14,16 +14,14 @@ All the components needed to create the inference server are in `DiffusersServer/` ``` -DiffusersServer/ # the example server package -├── __init__.py +DiffusersServer/ +├── **init**.py ├── create_server.py # helper script to build/run the app programmatically ├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) -├── serverasync.py # FastAPI app factory (create_app_fastapi) -├── superpipeline.py # optional custom pipeline glue code +├── serverasync.py # FastAPI app factory (create\_app\_fastapi) ├── uvicorn_diffu.py # convenience script to start uvicorn with recommended flags ``` - ## What `diffusers-async` adds / Why we needed it Core problem: a naive server that calls `pipe.__call__` concurrently can hit **race conditions** (e.g., `scheduler.set_timesteps` mutates shared state) or explode memory by deep-copying the whole pipeline per-request. @@ -32,7 +30,8 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r * **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*. * **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. Where available we call `scheduler.clone_for_request(...)`, otherwise we fallback to safe `deepcopy` or other heuristics. -* **`retrieve_timesteps(..., return_scheduler=True)`**: retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. This is the safe path for getting a scheduler configured per-request. +* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur. +* **`retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API. * **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. ## How the server works (high-level flow) @@ -51,7 +50,6 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r 3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`). 4. Multiple requests can run in parallel while sharing heavy weights and isolating mutable state. 
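+
+A minimal sketch of the same flow outside the server (illustrative only: the model id is just an example, and the import path assumes the `diffusers` fork used by this demo; later revisions move `RequestScopedPipeline` into `examples/server-async/utils`):
+
+```python
+import torch
+from concurrent.futures import ThreadPoolExecutor
+
+from diffusers import StableDiffusion3Pipeline
+from diffusers.pipelines.pipeline_utils import RequestScopedPipeline  # provided by the fork
+
+# Heavy weights are loaded once and shared by every request
+base = StableDiffusion3Pipeline.from_pretrained(
+    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.float16
+).to("cuda")
+request_pipe = RequestScopedPipeline(base)
+
+def handle_request(prompt: str):
+    # Each call gets its own scheduler/RNG state; UNet, VAE and text encoders stay shared
+    result = request_pipe.generate(prompt=prompt, num_inference_steps=30, device="cuda")
+    return result.images[0]
+
+# Two concurrent "requests" against the single in-memory model
+with ThreadPoolExecutor(max_workers=2) as pool:
+    images = list(pool.map(handle_request, ["a lighthouse at dusk", "a red fox in the snow"]))
+```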
- ## How to set up and run the server ### 1) Install dependencies @@ -65,7 +63,7 @@ If using the `diffusers` fork via git, either: ```bash pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" pip install -r requirements.txt -``` +```` ### 2) Start the server @@ -97,17 +95,14 @@ Response example: ## Troubleshooting (quick) -* `Already borrowed` — tokenizers (Rust) error when used concurrently. +* `Already borrowed` — previously a Rust tokenizer concurrency error. + ✅ This is now fixed: `RequestScopedPipeline` manages an internal tokenizer lock so race conditions no longer happen. - * Workarounds: - - * Acquire a `Lock` around tokenization or around the pipeline call (serializes that part). - * Use the slow tokenizer (`converter_to_slow`) for concurrency tests. - * Patch only the tokenization method to use a lock instead of serializing entire forward. * `can't set attribute 'components'` — pipeline exposes read-only `components`. * The RequestScopedPipeline now detects read-only properties and skips setting them. + * Scheduler issues: * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. - + * ✅ Note: `retrieve_timesteps` is fully retro-compatible — if you don’t pass `return_scheduler=True`, the behavior is unchanged. From 06bb13644174125eb79f83c7692e6e034d5e737a Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 14 Sep 2025 18:39:06 -0600 Subject: [PATCH 20/34] The changes to the diffusers core have been undone and all logic is being moved to exmaples/server-async --- .../DiffusersServer/serverasync.py | 5 +- examples/server-async/utils/__init__.py | 1 + .../utils/requestscopedpipeline.py | 266 +++++++++++++++++ examples/server-async/utils/scheduler.py | 118 ++++++++ src/diffusers/pipelines/flux/pipeline_flux.py | 80 ++--- src/diffusers/pipelines/pipeline_utils.py | 276 +----------------- .../pipeline_stable_diffusion.py | 81 ++--- .../pipeline_stable_diffusion_3.py | 81 ++--- .../pipeline_stable_diffusion_xl.py | 81 ++--- .../pipeline_stable_diffusion_adapter.py | 80 ++--- .../pipeline_stable_diffusion_xl_adapter.py | 80 ++--- src/diffusers/schedulers/scheduling_amused.py | 5 - .../scheduling_consistency_decoder.py | 6 +- .../scheduling_consistency_models.py | 5 - .../scheduling_cosine_dpmsolver_multistep.py | 5 - src/diffusers/schedulers/scheduling_ddim.py | 5 - .../schedulers/scheduling_ddim_cogvideox.py | 5 - .../schedulers/scheduling_ddim_inverse.py | 5 - .../schedulers/scheduling_ddim_parallel.py | 6 - src/diffusers/schedulers/scheduling_ddpm.py | 5 - .../schedulers/scheduling_ddpm_parallel.py | 5 - .../schedulers/scheduling_ddpm_wuerstchen.py | 5 - .../schedulers/scheduling_deis_multistep.py | 5 - .../schedulers/scheduling_dpm_cogvideox.py | 5 - .../scheduling_dpmsolver_multistep.py | 6 - .../scheduling_dpmsolver_multistep_inverse.py | 5 - .../schedulers/scheduling_dpmsolver_sde.py | 5 - .../scheduling_dpmsolver_singlestep.py | 5 - .../scheduling_edm_dpmsolver_multistep.py | 5 - .../schedulers/scheduling_edm_euler.py | 5 - .../scheduling_euler_ancestral_discrete.py | 5 - .../schedulers/scheduling_euler_discrete.py | 5 - .../scheduling_flow_match_euler_discrete.py | 5 - src/diffusers/schedulers/scheduling_sde_ve.py | 4 - src/diffusers/schedulers/scheduling_tcd.py | 5 - src/diffusers/schedulers/scheduling_unclip.py | 5 - .../schedulers/scheduling_unipc_multistep.py | 5 - .../schedulers/scheduling_vq_diffusion.py | 5 - 38 
files changed, 498 insertions(+), 788 deletions(-) create mode 100644 examples/server-async/utils/__init__.py create mode 100644 examples/server-async/utils/requestscopedpipeline.py create mode 100644 examples/server-async/utils/scheduler.py diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index ff0e64080d81..61eb99c3fdce 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -3,11 +3,10 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel -from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD +from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD, logger import logging -from diffusers.pipelines.pipeline_utils import RequestScopedPipeline +from ..utils import RequestScopedPipeline from diffusers import * -from .superpipeline import * import random import uuid import tempfile diff --git a/examples/server-async/utils/__init__.py b/examples/server-async/utils/__init__.py new file mode 100644 index 000000000000..38b01f7aa59d --- /dev/null +++ b/examples/server-async/utils/__init__.py @@ -0,0 +1 @@ +from .requestscopedpipeline import RequestScopedPipeline \ No newline at end of file diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py new file mode 100644 index 000000000000..56f5626ed156 --- /dev/null +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -0,0 +1,266 @@ +from typing import Optional, Any, Iterable, List +import copy +import threading +import torch +from diffusers.utils import logging + +logger = logging.get_logger(__name__) + +def safe_tokenize(tokenizer, *args, lock, **kwargs): + with lock: + return tokenizer(*args, **kwargs) + +class RequestScopedPipeline: + DEFAULT_MUTABLE_ATTRS = [ + "_all_hooks", + "_offload_device", + "_progress_bar_config", + "_progress_bar", + "_rng_state", + "_last_seed", + "latents", + ] + + def __init__( + self, + pipeline: Any, + mutable_attrs: Optional[Iterable[str]] = None, + auto_detect_mutables: bool = True, + tensor_numel_threshold: int = 1_000_000, + tokenizer_lock: Optional[threading.Lock] = None + ): + self._base = pipeline + self.unet = getattr(pipeline, "unet", None) + self.vae = getattr(pipeline, "vae", None) + self.text_encoder = getattr(pipeline, "text_encoder", None) + self.components = getattr(pipeline, "components", None) + + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() + + self._auto_detect_mutables = bool(auto_detect_mutables) + self._tensor_numel_threshold = int(tensor_numel_threshold) + + self._auto_detected_attrs: List[str] = [] + + def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): + base_sched = getattr(self._base, "scheduler", None) + if base_sched is None: + return None + + if hasattr(base_sched, "clone_for_request"): + try: + return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) + except Exception as e: + logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + + try: + return copy.deepcopy(base_sched) + except Exception as e: + logger.warning(f"Deepcopy of scheduler 
failed: {e}. Returning original scheduler (*risky*).") + return base_sched + + def _autodetect_mutables(self, max_attrs: int = 40): + if not self._auto_detect_mutables: + return [] + + if self._auto_detected_attrs: + return self._auto_detected_attrs + + candidates: List[str] = [] + seen = set() + for name in dir(self._base): + if name.startswith("__"): + continue + if name in self._mutable_attrs: + continue + if name in ("to", "save_pretrained", "from_pretrained"): + continue + try: + val = getattr(self._base, name) + except Exception: + continue + + import types + + # skip callables and modules + if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): + continue + + # containers -> candidate + if isinstance(val, (dict, list, set, tuple, bytearray)): + candidates.append(name) + seen.add(name) + else: + # try Tensor detection + try: + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + candidates.append(name) + seen.add(name) + else: + logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}") + except Exception: + continue + + if len(candidates) >= max_attrs: + break + + self._auto_detected_attrs = candidates + logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") + return self._auto_detected_attrs + + def _is_readonly_property(self, base_obj, attr_name: str) -> bool: + try: + cls = type(base_obj) + descriptor = getattr(cls, attr_name, None) + if isinstance(descriptor, property): + return descriptor.fset is None + if hasattr(descriptor, "__set__") is False and descriptor is not None: + return False + except Exception: + pass + return False + + def _clone_mutable_attrs(self, base, local): + attrs_to_clone = list(self._mutable_attrs) + attrs_to_clone.extend(self._autodetect_mutables()) + + EXCLUDE_ATTRS = {"components",} + + for attr in attrs_to_clone: + if attr in EXCLUDE_ATTRS: + logger.debug(f"Skipping excluded attr '{attr}'") + continue + if not hasattr(base, attr): + continue + if self._is_readonly_property(base, attr): + logger.debug(f"Skipping read-only property '{attr}'") + continue + + try: + val = getattr(base, attr) + except Exception as e: + logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}") + continue + + try: + if isinstance(val, dict): + setattr(local, attr, dict(val)) + elif isinstance(val, (list, tuple, set)): + setattr(local, attr, list(val)) + elif isinstance(val, bytearray): + setattr(local, attr, bytearray(val)) + else: + # small tensors or atomic values + if isinstance(val, torch.Tensor): + if val.numel() <= self._tensor_numel_threshold: + setattr(local, attr, val.clone()) + else: + # don't clone big tensors, keep reference + setattr(local, attr, val) + else: + try: + setattr(local, attr, copy.copy(val)) + except Exception: + setattr(local, attr, val) + except (AttributeError, TypeError) as e: + logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}") + continue + except Exception as e: + logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") + continue + + def _is_tokenizer_component(self, component) -> bool: + if component is None: + return False + + tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) + + class_name = component.__class__.__name__.lower() + has_tokenizer_in_name = 'tokenizer' in class_name + + tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + 
has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) + + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) + + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): + local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) + + try: + local_pipe = copy.copy(self._base) + except Exception as e: + logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).") + local_pipe = copy.deepcopy(self._base) + + if local_scheduler is not None: + try: + setattr(local_pipe, "scheduler", local_scheduler) + except Exception: + logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") + + self._clone_mutable_attrs(self._base, local_pipe) + + # 4) wrap tokenizers on the local pipe with the lock wrapper + tokenizer_wrappers = {} # name -> original_tokenizer + try: + # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) + for name in dir(local_pipe): + if "tokenizer" in name and not name.startswith("_"): + tok = getattr(local_pipe, name, None) + if tok is not None and self._is_tokenizer_component(tok): + tokenizer_wrappers[name] = tok + setattr( + local_pipe, + name, + lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + ) + + # b) wrap tokenizers in components dict + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): + for key, val in local_pipe.components.items(): + if val is None: + continue + + if self._is_tokenizer_component(val): + tokenizer_wrappers[f"components[{key}]"] = val + local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( + tokenizer, *args, lock=self._tokenizer_lock, **kwargs + ) + + except Exception as e: + logger.debug(f"Tokenizer wrapping step encountered an error: {e}") + + result = None + cm = getattr(local_pipe, "model_cpu_offload_context", None) + try: + if callable(cm): + try: + with cm(): + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except TypeError: + # cm might be a context manager instance rather than callable + try: + with cm: + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + except Exception as e: + logger.debug(f"model_cpu_offload_context usage failed: {e}. 
Proceeding without it.") + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + else: + # no offload context available — call directly + result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) + + return result + + finally: + try: + for name, tok in tokenizer_wrappers.items(): + if name.startswith("components["): + key = name[len("components["):-1] + local_pipe.components[key] = tok + else: + setattr(local_pipe, name, tok) + except Exception as e: + logger.debug(f"Error restoring wrapped tokenizers: {e}") \ No newline at end of file diff --git a/examples/server-async/utils/scheduler.py b/examples/server-async/utils/scheduler.py new file mode 100644 index 000000000000..a20715e254cd --- /dev/null +++ b/examples/server-async/utils/scheduler.py @@ -0,0 +1,118 @@ +from typing import Any, Optional, Union, List +import torch +import copy +import inspect + +class BaseAsyncScheduler: + def __init__(self, scheduler: Any): + pass + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): + # I leave it as an example of what the Scheduler should do to implement it later + """local = copy.deepcopy(self) + local.set_timesteps(num_inference_steps=num_inference_steps, device=device) + return local""" + pass + + +def async_retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. + Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Backwards compatible: by default the function behaves exactly as before and returns + (timesteps_tensor, num_inference_steps) + + If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed + scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) + or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: + (timesteps_tensor, num_inference_steps, scheduler_in_use) + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Optional kwargs: + return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) + where `scheduler_in_use` is a scheduler instance that already has timesteps set. + This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. + + Returns: + `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or + `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. 
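+
+    Example (illustrative sketch; `pipe` here stands for any loaded diffusers pipeline):
+
+    ```py
+    # Non-mutating, per-request path: the shared `pipe.scheduler` is left untouched
+    timesteps, steps, local_scheduler = async_retrieve_timesteps(
+        pipe.scheduler, num_inference_steps=30, device="cuda", return_scheduler=True
+    )
+    # Default path matches the original helper and configures `pipe.scheduler` in place
+    timesteps, steps = async_retrieve_timesteps(pipe.scheduler, num_inference_steps=30, device="cuda")
+    ```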
+ """ + # pop our optional control kwarg (keeps compatibility) + return_scheduler = bool(kwargs.pop("return_scheduler", False)) + + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + + # choose scheduler to call set_timesteps on + scheduler_in_use = scheduler + if return_scheduler: + # Do not mutate the provided scheduler: prefer to clone if possible + if hasattr(scheduler, "clone_for_request"): + try: + # clone_for_request may accept num_inference_steps or other kwargs; be permissive + scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + except Exception: + scheduler_in_use = copy.deepcopy(scheduler) + else: + # fallback deepcopy (scheduler tends to be smallish - acceptable) + scheduler_in_use = copy.deepcopy(scheduler) + + # helper to test if set_timesteps supports a particular kwarg + def _accepts(param_name: str) -> bool: + try: + return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) + except (ValueError, TypeError): + # if signature introspection fails, be permissive and attempt the call later + return False + + # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) + if timesteps is not None: + accepts_timesteps = _accepts("timesteps") + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) + elif sigmas is not None: + accept_sigmas = _accepts("sigmas") + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + num_inference_steps = len(timesteps_out) + else: + # default path + scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps_out = scheduler_in_use.timesteps + + if return_scheduler: + return timesteps_out, num_inference_steps, scheduler_in_use + return timesteps_out, num_inference_steps \ No newline at end of file diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 1ae0156c71d6..42d20472bf0b 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -92,18 +92,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. 
Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -120,72 +112,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class FluxPipeline( diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index de6200f30c84..d311b5b6df20 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -21,9 +21,8 @@ import sys from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin, Iterable -import copy -import threading +from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin + import numpy as np import PIL.Image @@ -71,8 +70,6 @@ ) from ..utils.hub_utils import _check_legacy_sharding_variant_format, load_or_create_model_card, populate_model_card from ..utils.torch_utils import empty_device_cache, get_device, is_compiled_module -import copy -from types import SimpleNamespace if is_torch_npu_available(): @@ -182,275 +179,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -def safe_tokenize(tokenizer, *args, lock, **kwargs): - with lock: - return tokenizer(*args, **kwargs) - - -class RequestScopedPipeline: - DEFAULT_MUTABLE_ATTRS = [ - "_all_hooks", - "_offload_device", - "_progress_bar_config", - "_progress_bar", - "_rng_state", - "_last_seed", - "latents", - ] - - def __init__( - self, - pipeline: Any, - mutable_attrs: Optional[Iterable[str]] = None, - auto_detect_mutables: bool = True, - tensor_numel_threshold: int = 1_000_000, - tokenizer_lock: Optional[threading.Lock] = None - ): - self._base = pipeline - self.unet = getattr(pipeline, "unet", None) - self.vae = getattr(pipeline, "vae", None) - self.text_encoder = getattr(pipeline, "text_encoder", None) - self.components = getattr(pipeline, "components", None) - - self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) - self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() - - self._auto_detect_mutables = bool(auto_detect_mutables) - self._tensor_numel_threshold = 
int(tensor_numel_threshold) - - self._auto_detected_attrs: List[str] = [] - - def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): - base_sched = getattr(self._base, "scheduler", None) - if base_sched is None: - return None - - if hasattr(base_sched, "clone_for_request"): - try: - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) - except Exception as e: - logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") - - try: - return copy.deepcopy(base_sched) - except Exception as e: - logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).") - return base_sched - - def _autodetect_mutables(self, max_attrs: int = 40): - if not self._auto_detect_mutables: - return [] - - if self._auto_detected_attrs: - return self._auto_detected_attrs - - candidates: List[str] = [] - seen = set() - for name in dir(self._base): - if name.startswith("__"): - continue - if name in self._mutable_attrs: - continue - if name in ("to", "save_pretrained", "from_pretrained"): - continue - try: - val = getattr(self._base, name) - except Exception: - continue - - import types - - # skip callables and modules - if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): - continue - - # containers -> candidate - if isinstance(val, (dict, list, set, tuple, bytearray)): - candidates.append(name) - seen.add(name) - else: - # try Tensor detection - try: - if isinstance(val, torch.Tensor): - if val.numel() <= self._tensor_numel_threshold: - candidates.append(name) - seen.add(name) - else: - logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}") - except Exception: - continue - - if len(candidates) >= max_attrs: - break - - self._auto_detected_attrs = candidates - logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}") - return self._auto_detected_attrs - - def _is_readonly_property(self, base_obj, attr_name: str) -> bool: - try: - cls = type(base_obj) - descriptor = getattr(cls, attr_name, None) - if isinstance(descriptor, property): - return descriptor.fset is None - if hasattr(descriptor, "__set__") is False and descriptor is not None: - return False - except Exception: - pass - return False - - def _clone_mutable_attrs(self, base, local): - attrs_to_clone = list(self._mutable_attrs) - attrs_to_clone.extend(self._autodetect_mutables()) - - EXCLUDE_ATTRS = {"components",} - - for attr in attrs_to_clone: - if attr in EXCLUDE_ATTRS: - logger.debug(f"Skipping excluded attr '{attr}'") - continue - if not hasattr(base, attr): - continue - if self._is_readonly_property(base, attr): - logger.debug(f"Skipping read-only property '{attr}'") - continue - - try: - val = getattr(base, attr) - except Exception as e: - logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}") - continue - - try: - if isinstance(val, dict): - setattr(local, attr, dict(val)) - elif isinstance(val, (list, tuple, set)): - setattr(local, attr, list(val)) - elif isinstance(val, bytearray): - setattr(local, attr, bytearray(val)) - else: - # small tensors or atomic values - if isinstance(val, torch.Tensor): - if val.numel() <= self._tensor_numel_threshold: - setattr(local, attr, val.clone()) - else: - # don't clone big tensors, keep reference - setattr(local, attr, val) - else: - try: - setattr(local, attr, copy.copy(val)) - except Exception: - # último recurso: asignar referencia - setattr(local, attr, val) - except 
(AttributeError, TypeError) as e: - logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}") - # continue without failing - continue - except Exception as e: - logger.debug(f"Unexpected error cloning attribute '{attr}': {e}") - continue - - def _is_tokenizer_component(self, component) -> bool: - """Determine whether a component is a tokenizer, based on common methods and attributes.""" - if component is None: - return False - - # Check for common tokenizer methods - tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] - has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - - # Check the class name - class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = 'tokenizer' in class_name - - # Check for common tokenizer attributes - tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] - has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - - return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) - - def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): - local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) - - try: - local_pipe = copy.copy(self._base) - except Exception as e: - logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).") - local_pipe = copy.deepcopy(self._base) - - if local_scheduler is not None: - try: - setattr(local_pipe, "scheduler", local_scheduler) - except Exception: - logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") - - self._clone_mutable_attrs(self._base, local_pipe) - - # 4) wrap tokenizers on the local pipe with the lock wrapper - tokenizer_wrappers = {} # name -> original_tokenizer - try: - # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) - for name in dir(local_pipe): - if "tokenizer" in name and not name.startswith("_"): - tok = getattr(local_pipe, name, None) - if tok is not None and self._is_tokenizer_component(tok): - tokenizer_wrappers[name] = tok - setattr( - local_pipe, - name, - lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) - ) - - # b) wrap tokenizers in components dict - CRITICAL FIX - if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): - for key, val in local_pipe.components.items(): - if val is None: - continue - - # Only wrap it if it really IS a tokenizer - if self._is_tokenizer_component(val): - tokenizer_wrappers[f"components[{key}]"] = val - # Create a new lambda that correctly captures 'val' - local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( - tokenizer, *args, lock=self._tokenizer_lock, **kwargs - ) - - except Exception as e: - logger.debug(f"Tokenizer wrapping step encountered an error: {e}") - - result = None - cm = getattr(local_pipe, "model_cpu_offload_context", None) - try: - if callable(cm): - try: - with cm(): - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except TypeError: - # cm might be a context manager instance rather than callable - try: - with cm: - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - except Exception as e: - logger.debug(f"model_cpu_offload_context usage failed: {e}. 
Proceeding without it.") - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - else: - # no offload context available — call directly - result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) - - return result - - finally: - try: - # Restaurar los tokenizadores originales - for name, tok in tokenizer_wrappers.items(): - if name.startswith("components["): - key = name[len("components["):-1] - local_pipe.components[key] = tok - else: - setattr(local_pipe, name, tok) - except Exception as e: - logger.debug(f"Error restoring wrapped tokenizers: {e}") - class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index ebc87f30a7f3..8023b4e77dc8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union, Tuple -import copy +from typing import Any, Callable, Dict, List, Optional, Union import torch from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -101,16 +100,8 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -127,72 +118,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 0ee5ad4bc949..4c3975dca2a4 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -13,8 +13,7 @@ # limitations under the License. 
import inspect -from typing import Any, Callable, Dict, List, Optional, Union, Tuple -import copy +from typing import Any, Callable, Dict, List, Optional, Union import torch from transformers import ( CLIPTextModelWithProjection, @@ -95,16 +94,8 @@ def retrieve_timesteps( **kwargs, ): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -121,72 +112,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 81f1580fce4a..b97cf6f1f6f8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -120,18 +120,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -148,73 +140,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - import copy - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusionXLPipeline( diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 63f40497afff..1ce6987114a7 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -136,18 +136,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -164,72 +156,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 74a1a0bb1b22..2802d690f3cc 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -161,18 +161,10 @@ def retrieve_timesteps( timesteps: Optional[List[int]] = None, sigmas: Optional[List[float]] = None, **kwargs, -) : +): r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. - Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Backwards compatible: by default the function behaves exactly as before and returns - (timesteps_tensor, num_inference_steps) - - If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed - scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`) - or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return: - (timesteps_tensor, num_inference_steps, scheduler_in_use) + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. Args: scheduler (`SchedulerMixin`): @@ -189,72 +181,36 @@ def retrieve_timesteps( Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - Optional kwargs: - return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use) - where `scheduler_in_use` is a scheduler instance that already has timesteps set. - This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler. - Returns: - `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or - `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`. + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. """ - # pop our optional control kwarg (keeps compatibility) - return_scheduler = bool(kwargs.pop("return_scheduler", False)) - if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") - - # choose scheduler to call set_timesteps on - scheduler_in_use = scheduler - if return_scheduler: - # Do not mutate the provided scheduler: prefer to clone if possible - if hasattr(scheduler, "clone_for_request"): - try: - # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) - except Exception: - scheduler_in_use = copy.deepcopy(scheduler) - else: - # fallback deepcopy (scheduler tends to be smallish - acceptable) - scheduler_in_use = copy.deepcopy(scheduler) - - # helper to test if set_timesteps supports a particular kwarg - def _accepts(param_name: str) -> bool: - try: - return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys()) - except (ValueError, TypeError): - # if signature introspection fails, be permissive and attempt the call later - return False - - # now call set_timesteps on the chosen scheduler_in_use (may be original or clone) if timesteps is not None: - accepts_timesteps = _accepts("timesteps") + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" timestep schedules. Please check whether you are using the correct scheduler." ) - scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = _accepts("sigmas") + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accept_sigmas: raise ValueError( - f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom" + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" f" sigmas schedules. Please check whether you are using the correct scheduler." 
) - scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - num_inference_steps = len(timesteps_out) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: - # default path - scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps_out = scheduler_in_use.timesteps - - if return_scheduler: - return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps class StableDiffusionXLAdapterPipeline( diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index ee767380e2f7..c4b336811cf4 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -7,7 +7,6 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from .scheduling_utils import SchedulerMixin -import copy def gumbel_noise(t, generator=None): @@ -162,7 +161,3 @@ def add_noise(self, sample, timesteps, generator=None): return masked_sample - def clone_for_request(self, num_inference_steps: int, temperature=(2, 0), device: Union[str, torch.device] = None): - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, temperature=temperature, device=device) - return local diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index 7bf3ec6f4aeb..acb24ea04d84 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -8,7 +8,6 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin -import copy # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -110,10 +109,7 @@ def set_timesteps( self.c_out = self.c_out.to(device) self.c_in = self.c_in.to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local + @property def init_noise_sigma(self): diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 271369777301..56145cebcf6f 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -243,11 +243,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index ecda598b8ce3..0752435240c3 100644 --- 
a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -241,11 +241,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # if a noise sampler is used, reinitialise it self.noise_sampler = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 9dc1006ee2a1..cd66070b69b6 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -339,11 +339,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py index 3e91077b7e50..efc04dd5023f 100644 --- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py @@ -302,11 +302,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index fba349c8fc9f..0ccf15828cee 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -286,11 +286,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 49107c9bca17..e61fe866a1ae 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -362,12 +362,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py 
index be6d7ad4880d..7cc0c4cef1f1 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -322,11 +322,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 571aaf52bccc..4d48b7c307fb 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -332,11 +332,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance def _get_variance(self, t, predicted_variance=None, variance_type=None): diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 126956204880..61143179329a 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -161,11 +161,6 @@ def set_timesteps( timesteps = torch.Tensor(timesteps).to(device) self.timesteps = timesteps - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 13adec66870c..e6581924e07d 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -318,11 +318,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py index 6de6d07f11c8..b6398399763c 100644 --- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py @@ -303,11 +303,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - 
local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 407215937fa6..d07ff8b2007b 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -457,12 +457,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index fd886b48eb22..06ff3c6c573a 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -330,11 +330,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 9bba69be9e49..9777a9ff54ee 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -412,11 +412,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 9d0bebe13d99..9cb72d021447 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -407,11 +407,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - 
return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index 105603e01f8d..bff9b267a058 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -273,11 +273,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index 20d3be9756dc..c5e3d8145b0e 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -261,11 +261,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 5713ffcfdee0..e9cb3107bbe9 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -318,11 +318,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index fee2d03e5291..513ef662820e 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -449,11 +449,6 @@ def set_timesteps( self._begin_index = None self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - 
local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def _sigma_to_t(self, sigma, log_sigmas): # get log sigma diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index 258e8252f557..da4b69957097 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -348,11 +348,6 @@ def set_timesteps( self._step_index = None self._begin_index = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index d31c6a9430cb..922a03a7fd34 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -123,10 +123,6 @@ def set_timesteps( self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) - def clone_for_request(self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None): - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, sampling_eps=sampling_eps, device=device) - return local def set_sigmas( self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 01a47bbd52a5..06063ddd3bfc 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -521,11 +521,6 @@ def set_timesteps( self._step_index = None self._begin_index = None - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local def step( self, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 4b07949ac30f..b825102dfda9 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -177,11 +177,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index b0bc1d1a8b16..38354555e9f3 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -430,11 
+430,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None, timesteps: Optional[List[int]] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, timesteps=timesteps) - return local # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 7ab4f151de65..5369901b7656 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -197,11 +197,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.log_cumprod_bt = self.log_cumprod_bt.to(device) self.log_cumprod_ct = self.log_cumprod_ct.to(device) - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - import copy - local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local def step( self, From a519915a226ae6c717b34c3d92542b9188dc0d77 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 14 Sep 2025 20:29:50 -0600 Subject: [PATCH 21/34] Update examples/server-async/utils/* --- .../DiffusersServer/serverasync.py | 2 +- .../{ => DiffusersServer}/utils/__init__.py | 0 .../utils/requestscopedpipeline.py | 43 ++++++++++++++----- .../{ => DiffusersServer}/utils/scheduler.py | 18 ++++---- 4 files changed, 44 insertions(+), 19 deletions(-) rename examples/server-async/{ => DiffusersServer}/utils/__init__.py (100%) rename examples/server-async/{ => DiffusersServer}/utils/requestscopedpipeline.py (86%) rename examples/server-async/{ => DiffusersServer}/utils/scheduler.py (95%) diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py index 61eb99c3fdce..d345db595838 100644 --- a/examples/server-async/DiffusersServer/serverasync.py +++ b/examples/server-async/DiffusersServer/serverasync.py @@ -5,7 +5,7 @@ from pydantic import BaseModel from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD, logger import logging -from ..utils import RequestScopedPipeline +from .utils import RequestScopedPipeline from diffusers import * import random import uuid diff --git a/examples/server-async/utils/__init__.py b/examples/server-async/DiffusersServer/utils/__init__.py similarity index 100% rename from examples/server-async/utils/__init__.py rename to examples/server-async/DiffusersServer/utils/__init__.py diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/DiffusersServer/utils/requestscopedpipeline.py similarity index 86% rename from examples/server-async/utils/requestscopedpipeline.py rename to examples/server-async/DiffusersServer/utils/requestscopedpipeline.py index 56f5626ed156..79f79e28f5e7 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/DiffusersServer/utils/requestscopedpipeline.py @@ -3,6 +3,8 @@ import threading import torch from diffusers.utils import logging +from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps + logger = logging.get_logger(__name__) @@ -27,7 +29,8 @@ def __init__( mutable_attrs: Optional[Iterable[str]] = None, 
auto_detect_mutables: bool = True, tensor_numel_threshold: int = 1_000_000, - tokenizer_lock: Optional[threading.Lock] = None + tokenizer_lock: Optional[threading.Lock] = None, + wrap_scheduler: bool = True ): self._base = pipeline self.unet = getattr(pipeline, "unet", None) @@ -35,6 +38,10 @@ def __init__( self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) + if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: + if not isinstance(pipeline.scheduler, BaseAsyncScheduler): + pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) + self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() @@ -48,17 +55,24 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] if base_sched is None: return None - if hasattr(base_sched, "clone_for_request"): - try: - return base_sched.clone_for_request(num_inference_steps=num_inference_steps, device=device, **clone_kwargs) - except Exception as e: - logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + if not isinstance(base_sched, BaseAsyncScheduler): + wrapped_scheduler = BaseAsyncScheduler(base_sched) + else: + wrapped_scheduler = base_sched try: - return copy.deepcopy(base_sched) + return wrapped_scheduler.clone_for_request( + num_inference_steps=num_inference_steps, + device=device, + **clone_kwargs + ) except Exception as e: - logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).") - return base_sched + logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + try: + return copy.deepcopy(wrapped_scheduler) + except Exception as e: + logger.warning(f"Deepcopy of scheduler failed: {e}. 
Returning original scheduler (*risky*).") + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -197,7 +211,16 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = if local_scheduler is not None: try: - setattr(local_pipe, "scheduler", local_scheduler) + timesteps, num_steps, configured_scheduler = async_retrieve_timesteps( + local_scheduler.scheduler, + num_inference_steps=num_inference_steps, + device=device, + return_scheduler=True, + **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} + ) + + final_scheduler = BaseAsyncScheduler(configured_scheduler) + setattr(local_pipe, "scheduler", final_scheduler) except Exception: logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") diff --git a/examples/server-async/utils/scheduler.py b/examples/server-async/DiffusersServer/utils/scheduler.py similarity index 95% rename from examples/server-async/utils/scheduler.py rename to examples/server-async/DiffusersServer/utils/scheduler.py index a20715e254cd..848905985dd4 100644 --- a/examples/server-async/utils/scheduler.py +++ b/examples/server-async/DiffusersServer/utils/scheduler.py @@ -5,14 +5,16 @@ class BaseAsyncScheduler: def __init__(self, scheduler: Any): - pass - - def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device] = None): - # I leave it as an example of what the Scheduler should do to implement it later - """local = copy.deepcopy(self) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device) - return local""" - pass + self.scheduler = scheduler + + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device, None] = None, **kwargs): + local = copy.deepcopy(self.scheduler) + + local.set_timesteps(num_inference_steps=num_inference_steps, device=device, **kwargs) + + cloned = self.__class__(local) + + return cloned def async_retrieve_timesteps( From 7cfee776c9b6d71d60c5a95469e0e873a582c9d2 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Sun, 14 Sep 2025 20:48:40 -0600 Subject: [PATCH 22/34] Fix BaseAsyncScheduler --- .../DiffusersServer/utils/scheduler.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/examples/server-async/DiffusersServer/utils/scheduler.py b/examples/server-async/DiffusersServer/utils/scheduler.py index 848905985dd4..5925edfeab04 100644 --- a/examples/server-async/DiffusersServer/utils/scheduler.py +++ b/examples/server-async/DiffusersServer/utils/scheduler.py @@ -7,15 +7,32 @@ class BaseAsyncScheduler: def __init__(self, scheduler: Any): self.scheduler = scheduler + def __getattr__(self, name: str): + if hasattr(self.scheduler, name): + return getattr(self.scheduler, name) + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + def __setattr__(self, name: str, value): + if name == 'scheduler': + super().__setattr__(name, value) + else: + if hasattr(self, 'scheduler') and hasattr(self.scheduler, name): + setattr(self.scheduler, name, value) + else: + super().__setattr__(name, value) + def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device, None] = None, **kwargs): local = copy.deepcopy(self.scheduler) - local.set_timesteps(num_inference_steps=num_inference_steps, device=device, **kwargs) - cloned = self.__class__(local) - return cloned + def __repr__(self): + return f"BaseAsyncScheduler({repr(self.scheduler)})" + + def __str__(self): + return 
f"BaseAsyncScheduler wrapping: {str(self.scheduler)}" + def async_retrieve_timesteps( scheduler, From e574f07968ca2f2d47c69839378e37d7e8f09f61 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 15 Sep 2025 07:09:04 -0600 Subject: [PATCH 23/34] Rollback in the core of the diffusers --- src/diffusers/pipelines/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 86b4e22fb814..8ed07a72e3fd 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -522,7 +522,6 @@ DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin, - RequestScopedPipeline ) try: From 10496638912eceaf951d1d3718442489d0db70c0 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 15 Sep 2025 12:49:53 -0600 Subject: [PATCH 24/34] Update examples/server-async/README.md --- examples/server-async/README.md | 66 +++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index edf07852c247..59c8cd6eda62 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,11 +1,10 @@ # Asynchronous server and parallel execution of models > Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. -> We recommend running about 10 to 50 inferences in parallel to have a good performance of 25-30s to 1-1:30min on average +> We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.) ## ⚠️ IMPORTANT -* This example uses a custom Diffusers fork: `https://github.com/F4k3r22/diffusers-async`. * The server and inference harness live in this repo: `https://github.com/F4k3r22/DiffusersServer`. The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. @@ -15,7 +14,11 @@ All the components needed to create the inference server are in `DiffusersServer ``` DiffusersServer/ -├── **init**.py +├── utils/ +├─────── __init__.py +├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences +├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model +├── __init__.py ├── create_server.py # helper script to build/run the app programmatically ├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) ├── serverasync.py # FastAPI app factory (create\_app\_fastapi) @@ -29,10 +32,11 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r `diffusers-async` / this example addresses that by: * **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*. -* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. 
Where available we call `scheduler.clone_for_request(...)`, otherwise we fallback to safe `deepcopy` or other heuristics. -* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur. -* **`retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API. -* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. +* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. The system uses `BaseAsyncScheduler.clone_for_request(...)` for scheduler cloning, with fallback to safe `deepcopy` or other heuristics. +* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock with automatic tokenizer detection and wrapping. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur. +* **`async_retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API. +* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. Configurable tensor size threshold prevents cloning of large tensors. +* **Enhanced scheduler wrapping**: `BaseAsyncScheduler` automatically wraps schedulers with improved `__getattr__`, `__setattr__`, and debugging methods (`__repr__`, `__str__`). ## How the server works (high-level flow) @@ -41,10 +45,12 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r * The server uses `RequestScopedPipeline.generate(...)` which: + * automatically wraps the base scheduler in `BaseAsyncScheduler` (if not already wrapped), * obtains a *local scheduler* (via `clone_for_request()` or `deepcopy`), * does `local_pipe = copy.copy(base_pipe)` (shallow copy), * sets `local_pipe.scheduler = local_scheduler` (if possible), - * clones only small mutable attributes (callbacks, rng, small latents), + * clones only small mutable attributes (callbacks, rng, small latents) with auto-detection, + * wraps tokenizers with thread-safe locks to prevent race conditions, * optionally enters a `model_cpu_offload_context()` for memory offload hooks, * calls the pipeline on the local view (`local_pipe(...)`). 3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`). @@ -56,14 +62,10 @@ Core problem: a naive server that calls `pipe.__call__` concurrently can hit **r Recommended: create a virtualenv / conda environment. 
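
If you want to exercise the request-scoped flow outside the full server first, a minimal sketch looks like this. It is illustrative only: the model id, the import path of `RequestScopedPipeline` (it lives under this example's `utils/` package) and the seeding strategy are assumptions, not part of the server code.

```python
# Minimal sketch: one shared pipeline in memory, per-request lightweight views.
import random
import threading

import torch
from diffusers import StableDiffusion3Pipeline

from utils import RequestScopedPipeline  # assumed import path for this example

base = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16
).to("cuda")

# Wrap once at startup; UNet/VAE/text encoders stay shared and are never duplicated.
request_pipe = RequestScopedPipeline(base)


def handle_request(prompt: str, steps: int = 28):
    # Each request gets its own generator; generate() shallow-copies the pipeline
    # and gives the copy a per-request scheduler clone before running inference.
    gen = torch.Generator(device="cuda").manual_seed(random.randint(0, 10_000_000))
    out = request_pipe.generate(
        prompt=prompt,
        num_inference_steps=steps,
        generator=gen,
        device="cuda",
        output_type="pil",
    )
    return out.images


# Two concurrent requests sharing the same weights:
threads = [threading.Thread(target=handle_request, args=(p,)) for p in ("a cat", "a dog")]
for t in threads:
    t.start()
for t in threads:
    t.join()
```
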
-If using the `diffusers` fork via git, either: - -**A) Preinstall the fork first:** - ```bash -pip install "git+https://github.com/F4k3r22/diffusers-async.git@main" +pip install diffusers pip install -r requirements.txt -```` +``` ### 2) Start the server @@ -93,16 +95,42 @@ Response example: } ``` +## Advanced Configuration + +### RequestScopedPipeline Parameters + +```python +RequestScopedPipeline( + pipeline, # Base pipeline to wrap + mutable_attrs=None, # Custom list of attributes to clone + auto_detect_mutables=True, # Enable automatic detection of mutable attributes + tensor_numel_threshold=1_000_000, # Tensor size threshold for cloning + tokenizer_lock=None, # Custom threading lock for tokenizers + wrap_scheduler=True # Auto-wrap scheduler in BaseAsyncScheduler +) +``` + +### BaseAsyncScheduler Features + +* Transparent proxy to the original scheduler with `__getattr__` and `__setattr__` +* `clone_for_request()` method for safe per-request scheduler cloning +* Enhanced debugging with `__repr__` and `__str__` methods +* Full compatibility with existing scheduler APIs + ## Troubleshooting (quick) * `Already borrowed` — previously a Rust tokenizer concurrency error. - ✅ This is now fixed: `RequestScopedPipeline` manages an internal tokenizer lock so race conditions no longer happen. + ✅ This is now fixed: `RequestScopedPipeline` automatically detects and wraps tokenizers with thread locks, so race conditions no longer happen. * `can't set attribute 'components'` — pipeline exposes read-only `components`. - - * The RequestScopedPipeline now detects read-only properties and skips setting them. + ✅ The RequestScopedPipeline now detects read-only properties and skips setting them automatically. * Scheduler issues: + * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `async_retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. + ✅ Note: `async_retrieve_timesteps` is fully retro-compatible — if you don't pass `return_scheduler=True`, the behavior is unchanged. + +* Memory issues with large tensors: + ✅ The system now has configurable `tensor_numel_threshold` to prevent cloning of large tensors while still cloning small mutable ones. - * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler. - * ✅ Note: `retrieve_timesteps` is fully retro-compatible — if you don’t pass `return_scheduler=True`, the behavior is unchanged. +* Automatic tokenizer detection: + ✅ The system automatically identifies tokenizer components by checking for tokenizer methods, class names, and attributes, then applies thread-safe wrappers. 
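
Related to the scheduler notes above, both per-request paths can be exercised in isolation. This is an illustrative sketch, not part of the server: the `utils.scheduler` import path is an assumption, and any diffusers scheduler implementing `set_timesteps` should behave the same way as the `FlowMatchEulerDiscreteScheduler` used here.

```python
# Sketch: configure per-request scheduler state without mutating the shared instance.
from diffusers import FlowMatchEulerDiscreteScheduler

from utils.scheduler import BaseAsyncScheduler, async_retrieve_timesteps  # assumed path

shared = BaseAsyncScheduler(FlowMatchEulerDiscreteScheduler())  # transparent proxy

# Option A: clone_for_request() deep-copies the wrapped scheduler and calls
# set_timesteps() on the copy, so the shared scheduler is never touched.
local = shared.clone_for_request(num_inference_steps=28, device="cpu")

# Option B: the retro-compatible helper; with return_scheduler=True it hands back a
# configured scheduler instead of mutating the one you passed in.
timesteps, num_steps, configured = async_retrieve_timesteps(
    shared.scheduler,
    num_inference_steps=28,
    device="cpu",
    return_scheduler=True,
)
print(num_steps, len(timesteps), type(configured).__name__)
```
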
\ No newline at end of file From 531662085d82911558ace92ff33e2236406f70ae Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 15 Sep 2025 13:01:28 -0600 Subject: [PATCH 25/34] Complete rollback of diffusers core files --- src/diffusers/pipelines/flux/pipeline_flux.py | 4 ++-- src/diffusers/pipelines/pipeline_utils.py | 2 -- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- .../stable_diffusion_3/pipeline_stable_diffusion_3.py | 1 + src/diffusers/schedulers/scheduling_amused.py | 1 - src/diffusers/schedulers/scheduling_consistency_decoder.py | 2 -- src/diffusers/schedulers/scheduling_consistency_models.py | 1 - .../schedulers/scheduling_cosine_dpmsolver_multistep.py | 1 - src/diffusers/schedulers/scheduling_ddim.py | 1 - src/diffusers/schedulers/scheduling_ddim_cogvideox.py | 1 - src/diffusers/schedulers/scheduling_ddim_inverse.py | 1 - src/diffusers/schedulers/scheduling_ddim_parallel.py | 1 - src/diffusers/schedulers/scheduling_ddpm.py | 1 - src/diffusers/schedulers/scheduling_ddpm_parallel.py | 1 - src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py | 1 - src/diffusers/schedulers/scheduling_deis_multistep.py | 2 -- src/diffusers/schedulers/scheduling_dpm_cogvideox.py | 1 - .../schedulers/scheduling_dpmsolver_multistep_inverse.py | 2 -- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 1 - src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 1 - .../schedulers/scheduling_edm_dpmsolver_multistep.py | 1 - src/diffusers/schedulers/scheduling_edm_euler.py | 1 - .../schedulers/scheduling_euler_ancestral_discrete.py | 1 - src/diffusers/schedulers/scheduling_euler_discrete.py | 1 - .../schedulers/scheduling_flow_match_euler_discrete.py | 1 - src/diffusers/schedulers/scheduling_sde_ve.py | 2 -- src/diffusers/schedulers/scheduling_tcd.py | 1 - src/diffusers/schedulers/scheduling_unclip.py | 1 - src/diffusers/schedulers/scheduling_unipc_multistep.py | 2 -- src/diffusers/schedulers/scheduling_vq_diffusion.py | 1 - 30 files changed, 4 insertions(+), 36 deletions(-) diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 42d20472bf0b..5041e352f73d 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -13,8 +13,8 @@ # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Union, Tuple -import copy +from typing import Any, Callable, Dict, List, Optional, Union + import numpy as np import torch from transformers import ( diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d311b5b6df20..01b3c56777c8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -23,7 +23,6 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin - import numpy as np import PIL.Image import requests @@ -179,7 +178,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - class DiffusionPipeline(ConfigMixin, PushToHubMixin): r""" Base class for all pipelines. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 8023b4e77dc8..cb97f18efeff 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect from typing import Any, Callable, Dict, List, Optional, Union + import torch from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -150,7 +151,6 @@ def retrieve_timesteps( return timesteps, num_inference_steps - class StableDiffusionPipeline( DiffusionPipeline, StableDiffusionMixin, diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 4c3975dca2a4..1618f89a49e3 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -14,6 +14,7 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union + import torch from transformers import ( CLIPTextModelWithProjection, diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index c4b336811cf4..238b8d869171 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -160,4 +160,3 @@ def add_noise(self, sample, timesteps, generator=None): masked_sample[mask_indices] = self.config.mask_token_id return masked_sample - diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index acb24ea04d84..d7af018b284a 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -109,8 +109,6 @@ def set_timesteps( self.c_out = self.c_out.to(device) self.c_in = self.c_in.to(device) - - @property def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 56145cebcf6f..0f5062258800 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -243,7 +243,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index 0752435240c3..66ed296da8ea 100644 --- a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -241,7 +241,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # if a noise sampler is used, reinitialise it self.noise_sampler = None - # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index cd66070b69b6..5ee0d084f060 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -339,7 +339,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py index efc04dd5023f..c19efdc7834d 100644 --- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py @@ -302,7 +302,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 0ccf15828cee..49dba840d089 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -286,7 +286,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index e61fe866a1ae..7c3f03a8dbe1 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -362,7 +362,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 7cc0c4cef1f1..0fab6d910a82 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -322,7 +322,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 4d48b7c307fb..ec741f9ecb7d 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -332,7 +332,6 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 61143179329a..71f08277ebd7 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -161,7 +161,6 @@ def set_timesteps( timesteps = torch.Tensor(timesteps).to(device) self.timesteps = timesteps - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index e6581924e07d..7d8685ba10c3 100644 --- 
a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -317,8 +317,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py index b6398399763c..f7b63720e107 100644 --- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py +++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py @@ -303,7 +303,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device) - def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 06ff3c6c573a..9ec958851111 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -329,8 +329,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._step_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 9777a9ff54ee..eeb06773d977 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -412,7 +412,6 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None - def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 9cb72d021447..8663210a6244 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -407,7 +407,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index bff9b267a058..f1b38aaff56c 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -273,7 +273,6 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas def _compute_karras_sigmas(self, ramp, 
sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index c5e3d8145b0e..dbeff3de5652 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -261,7 +261,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index e9cb3107bbe9..9cdaa2c5e101 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -318,7 +318,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 513ef662820e..f58d918dbfbe 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -449,7 +449,6 @@ def set_timesteps( self._begin_index = None self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication - def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(np.maximum(sigma, 1e-10)) diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index da4b69957097..1a4f12ddfa53 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -348,7 +348,6 @@ def set_timesteps( self._step_index = None self._begin_index = None - def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 922a03a7fd34..1bfc08cce5e9 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -24,7 +24,6 @@ from ..utils import BaseOutput from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin, SchedulerOutput -import copy @dataclass @@ -123,7 +122,6 @@ def set_timesteps( self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) - def set_sigmas( self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None ): diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 06063ddd3bfc..3fd5c341eca9 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -521,7 +521,6 @@ def 
set_timesteps( self._step_index = None self._begin_index = None - def step( self, model_output: torch.Tensor, diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index b825102dfda9..d78efabfbc57 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -177,7 +177,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) - def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: prev_timestep = t - 1 diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 38354555e9f3..162a34bd2774 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -429,8 +429,6 @@ def set_timesteps( self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 5369901b7656..57306301d023 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -197,7 +197,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.log_cumprod_bt = self.log_cumprod_bt.to(device) self.log_cumprod_ct = self.log_cumprod_ct.to(device) - def step( self, model_output: torch.Tensor, From 0ecdfc3ff5c3d4711b566332875ffe61cec2998b Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Tue, 16 Sep 2025 19:34:44 -0600 Subject: [PATCH 26/34] Simple implementation of an asynchronous server compatible with SD3-3.5 and Flux Pipelines --- .../server-async/DiffusersServer/__init__.py | 2 - .../DiffusersServer/create_server.py | 45 --- .../DiffusersServer/serverasync.py | 340 ------------------ .../DiffusersServer/utils/__init__.py | 1 - .../DiffusersServer/uvicorn_diffu.py | 66 ---- .../{DiffusersServer => }/Pipelines.py | 68 ++-- examples/server-async/README.md | 2 +- examples/server-async/server.py | 11 - examples/server-async/serverasync.py | 223 ++++++++++++ examples/server-async/test.py | 2 +- examples/server-async/utils/__init__.py | 2 + .../utils/requestscopedpipeline.py | 0 .../{DiffusersServer => }/utils/scheduler.py | 0 examples/server-async/utils/utils.py | 44 +++ 14 files changed, 315 insertions(+), 491 deletions(-) delete mode 100644 examples/server-async/DiffusersServer/__init__.py delete mode 100644 examples/server-async/DiffusersServer/create_server.py delete mode 100644 examples/server-async/DiffusersServer/serverasync.py delete mode 100644 examples/server-async/DiffusersServer/utils/__init__.py delete mode 100644 examples/server-async/DiffusersServer/uvicorn_diffu.py rename examples/server-async/{DiffusersServer => }/Pipelines.py (59%) delete mode 100644 examples/server-async/server.py create mode 100644 examples/server-async/serverasync.py create mode 100644 examples/server-async/utils/__init__.py rename examples/server-async/{DiffusersServer => }/utils/requestscopedpipeline.py (100%) rename examples/server-async/{DiffusersServer 
=> }/utils/scheduler.py (100%) create mode 100644 examples/server-async/utils/utils.py diff --git a/examples/server-async/DiffusersServer/__init__.py b/examples/server-async/DiffusersServer/__init__.py deleted file mode 100644 index 0d8d5761a939..000000000000 --- a/examples/server-async/DiffusersServer/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .Pipelines import TextToImagePipelineSD3 -from .create_server import create_inference_server_Async as DiffusersServerApp \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/create_server.py b/examples/server-async/DiffusersServer/create_server.py deleted file mode 100644 index 7ccfd9c742f8..000000000000 --- a/examples/server-async/DiffusersServer/create_server.py +++ /dev/null @@ -1,45 +0,0 @@ -# create_server.py - -from .Pipelines import * -from .serverasync import * -from .uvicorn_diffu import * -import asyncio - -def create_inference_server_Async( - model:str, - type_model: str = 't2im', - host: str = '0.0.0.0', - port: int = 8500, - threads=5, - enable_memory_monitor=True, - custom_model: bool = False, - custom_pipeline: Optional[Type] | None = None, - constructor_pipeline: Optional[Type] | None = None, - components: Optional[Dict[str, Any]] = None, - api_name: Optional[str] = 'custom_api', - torch_dtype = torch.bfloat16 -): - config = ServerConfigModels( - model=model, - type_models=type_model, - custom_model=custom_model, - custom_pipeline=custom_pipeline, - constructor_pipeline=constructor_pipeline, - components=components, - api_name=api_name, - torch_dtype=torch_dtype, - host=host, - port=port - ) - - app = create_app_fastapi(config) - - asyncio.run(run_uvicorn_server( - app, - host=host, - port=port, - workers=threads, - enable_memory_monitor=enable_memory_monitor - )) - - return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/serverasync.py b/examples/server-async/DiffusersServer/serverasync.py deleted file mode 100644 index d345db595838..000000000000 --- a/examples/server-async/DiffusersServer/serverasync.py +++ /dev/null @@ -1,340 +0,0 @@ -from fastapi import FastAPI, HTTPException, Request -from fastapi.responses import FileResponse -from fastapi.middleware.cors import CORSMiddleware -from fastapi.concurrency import run_in_threadpool -from pydantic import BaseModel -from .Pipelines import TextToImagePipelineSD3, TextToImagePipelineFlux, TextToImagePipelineSD, logger -import logging -from .utils import RequestScopedPipeline -from diffusers import * -import random -import uuid -import tempfile -from dataclasses import dataclass -import os -import torch -import threading -import gc -from typing import Optional, Dict, Any, Type -from dataclasses import dataclass, field -from typing import List -from contextlib import asynccontextmanager -import asyncio - -@dataclass -class PresetModels: - SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) - SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) - Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-schnell']) - -class ModelPipelineInitializer: - def __init__(self, model: str = '', type_models: str = 't2im'): - self.model = model - self.type_models = type_models - self.pipeline = None - self.device = "cuda" if torch.cuda.is_available() else "mps" - self.model_type = None - - def initialize_pipeline(self): - 
if not self.model: - raise ValueError("Model name not provided") - - # Check if model exists in PresetModels - preset_models = PresetModels() - - # Determine which model type we're dealing with - if self.model in preset_models.SD3: - self.model_type = "SD3" - elif self.model in preset_models.SD3_5: - self.model_type = "SD3_5" - elif self.model in preset_models.Flux: - self.model_type = "Flux" - else: - self.model_type = "SD" - - # Create appropriate pipeline based on model type and type_models - if self.type_models == 't2im': - if self.model_type in ["SD3", "SD3_5"]: - self.pipeline = TextToImagePipelineSD3(self.model) - elif self.model_type == "Flux": - self.pipeline = TextToImagePipelineFlux(self.model) - elif self.model_type == "SD": - self.pipeline = TextToImagePipelineSD(self.model) - else: - raise ValueError(f"Model type {self.model_type} not supported for text-to-image") - elif self.type_models == 't2v': - raise ValueError(f"Unsupported type_models: {self.type_models}") - - return self.pipeline - -class Utils: - def __init__(self, host: str = '0.0.0.0', port: int = 8500): - self.service_url = f"http://{host}:{port}" - self.image_dir = os.path.join(tempfile.gettempdir(), "images") - if not os.path.exists(self.image_dir): - os.makedirs(self.image_dir) - - self.video_dir = os.path.join(tempfile.gettempdir(), "videos") - if not os.path.exists(self.video_dir): - os.makedirs(self.video_dir) - - def save_image(self, image): - if hasattr(image, "to"): - try: - image = image.to("cpu") - except Exception: - pass - - if isinstance(image, torch.Tensor): - from torchvision import transforms - to_pil = transforms.ToPILImage() - image = to_pil(image.squeeze(0).clamp(0, 1)) - - filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" - image_path = os.path.join(self.image_dir, filename) - logger.info(f"Saving image to {image_path}") - - image.save(image_path, format="PNG", optimize=True) - - del image - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - return os.path.join(self.service_url, "images", filename) - -@dataclass -class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' - type_models: str = 't2im' - custom_model : bool = False - constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None - components: Optional[Dict[str, Any]] = None - api_name: Optional[str] = 'custom_api' - torch_dtype: Optional[torch.dtype] = None - host: str = '0.0.0.0' - port: int = 8500 - -def create_app_fastapi(config: ServerConfigModels) -> FastAPI: - - server_config = config or ServerConfigModels() - - @asynccontextmanager - async def lifespan(app: FastAPI): - logging.basicConfig(level=logging.INFO) - app.state.logger = logging.getLogger("diffusers-server") - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' - os.environ['CUDA_LAUNCH_BLOCKING'] = '0' - - app.state.total_requests = 0 - app.state.active_inferences = 0 - app.state.metrics_lock = asyncio.Lock() - app.state.metrics_task = None - - app.state.utils_app = Utils( - host=server_config.host, - port=server_config.port, - ) - - async def metrics_loop(): - try: - while True: - async with app.state.metrics_lock: - total = app.state.total_requests - active = app.state.active_inferences - app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}") - await asyncio.sleep(5) - except asyncio.CancelledError: - app.state.logger.info("Metrics loop cancelled") - raise - - app.state.metrics_task = 
asyncio.create_task(metrics_loop()) - - try: - yield - finally: - task = app.state.metrics_task - if task: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - try: - stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) - if callable(stop_fn): - await run_in_threadpool(stop_fn) - except Exception as e: - app.state.logger.warning(f"Error during pipeline shutdown: {e}") - - app.state.logger.info("Lifespan shutdown complete") - - app = FastAPI(lifespan=lifespan) - - logger = logging.getLogger("DiffusersServer.Pipelines") - - if server_config.custom_model: - if server_config.constructor_pipeline is None: - raise ValueError("constructor_pipeline cannot be None - a valid pipeline constructor is required") - - initializer = server_config.constructor_pipeline( - model_path=server_config.model, - pipeline=server_config.custom_pipeline, - torch_dtype=server_config.torch_dtype, - components=server_config.components, - ) - model_pipeline = initializer.start() - request_pipe = None - pipeline_lock = threading.Lock() - - else: - initializer = ModelPipelineInitializer( - model=server_config.model, - type_models=server_config.type_models, - ) - model_pipeline = initializer.initialize_pipeline() - model_pipeline.start() - - request_pipe = RequestScopedPipeline(model_pipeline.pipeline) - pipeline_lock = threading.Lock() - - logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") - - app.state.MODEL_INITIALIZER = initializer - app.state.MODEL_PIPELINE = model_pipeline - app.state.REQUEST_PIPE = request_pipe - app.state.PIPELINE_LOCK = pipeline_lock - - class JSONBodyQueryAPI(BaseModel): - model : str | None = None - prompt : str - negative_prompt : str | None = None - num_inference_steps : int = 28 - num_images_per_prompt : int = 1 - - @app.middleware("http") - async def count_requests_middleware(request: Request, call_next): - async with app.state.metrics_lock: - app.state.total_requests += 1 - response = await call_next(request) - return response - - - @app.get("/") - async def root(): - return {"message": "Welcome to the Diffusers Server"} - - @app.post("/api/diffusers/inference") - async def api(json: JSONBodyQueryAPI): - prompt = json.prompt - negative_prompt = json.negative_prompt or "" - num_steps = json.num_inference_steps - num_images_per_prompt = json.num_images_per_prompt - - wrapper = app.state.MODEL_PIPELINE - initializer = app.state.MODEL_INITIALIZER - - utils_app = app.state.utils_app - - - if not wrapper or not wrapper.pipeline: - raise HTTPException(500, "Model not initialized correctly") - if not prompt.strip(): - raise HTTPException(400, "No prompt provided") - - - def make_generator(): - g = torch.Generator(device=initializer.device) - return g.manual_seed(random.randint(0, 10_000_000)) - - req_pipe = app.state.REQUEST_PIPE - - def infer(): - gen = make_generator() - return req_pipe.generate( - prompt=prompt, - negative_prompt=negative_prompt, - generator=gen, - num_inference_steps=num_steps, - num_images_per_prompt=num_images_per_prompt, - device=initializer.device, - output_type="pil", - ) - - try: - async with app.state.metrics_lock: - app.state.active_inferences += 1 - - output = await run_in_threadpool(infer) - - async with app.state.metrics_lock: - app.state.active_inferences = max(0, app.state.active_inferences - 1) - - urls = [utils_app.save_image(img) for img in output.images] - return {"response": urls} - - except Exception as e: - async with app.state.metrics_lock: - 
app.state.active_inferences = max(0, app.state.active_inferences - 1) - logger.error(f"Error during inference: {e}") - raise HTTPException(500, f"Error in processing: {e}") - - finally: - if torch.cuda.is_available(): - torch.cuda.synchronize() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - torch.cuda.ipc_collect() - gc.collect() - - - @app.get("/images/{filename}") - async def serve_image(filename: str): - utils_app = app.state.utils_app - file_path = os.path.join(utils_app.image_dir, filename) - if not os.path.isfile(file_path): - raise HTTPException(status_code=404, detail="Image not found") - return FileResponse(file_path, media_type="image/png") - - @app.get("/api/models") - async def list_models(): - return { - "current_model" : server_config.model, - "type" : server_config.type_models, - "all_models": { - "type": "T2Img", - "SD3": PresetModels().SD3, - "SD3_5": PresetModels().SD3_5, - "Flux": PresetModels().Flux, - } - } - - @app.get("/api/status") - async def get_status(): - memory_info = {} - if torch.cuda.is_available(): - memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB - memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB - memory_info = { - "memory_allocated_gb": round(memory_allocated, 2), - "memory_reserved_gb": round(memory_reserved, 2), - "device": torch.cuda.get_device_name(0) - } - - return { - "current_model" : server_config.model, - "type_models" : server_config.type_models, - "memory" : memory_info} - - - app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - return app \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/utils/__init__.py b/examples/server-async/DiffusersServer/utils/__init__.py deleted file mode 100644 index 38b01f7aa59d..000000000000 --- a/examples/server-async/DiffusersServer/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .requestscopedpipeline import RequestScopedPipeline \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/uvicorn_diffu.py b/examples/server-async/DiffusersServer/uvicorn_diffu.py deleted file mode 100644 index c2688e25497d..000000000000 --- a/examples/server-async/DiffusersServer/uvicorn_diffu.py +++ /dev/null @@ -1,66 +0,0 @@ -import uvicorn -import logging -import gc -import psutil -import os -import threading -import time - -def setup_logging(): - logging.basicConfig(level=logging.INFO) - return logging.getLogger('uvicorn') - -logger = setup_logging() - -def memory_cleanup(interval=30): - while True: - try: - gc.collect() - - process = psutil.Process(os.getpid()) - mem = process.memory_info().rss / 1024 / 1024 - logger.info(f"Memory in use: {mem:.2f} MB") - - time.sleep(interval) - except Exception as e: - logger.error(f"Memory clearing error: {str(e)}") - time.sleep(interval) - -def run_uvicorn_server( - app, - host='0.0.0.0', - port=8500, - workers=5, - cleanup_interval=30, - channel_timeout=900, - headers=[ - ("server", "DiffusersServer") - ], - enable_memory_monitor=True -): - gc.enable() - gc.set_threshold(700, 10, 5) - - if enable_memory_monitor: - cleanup_thread = threading.Thread( - target=memory_cleanup, - args=(cleanup_interval,), - daemon=True - ) - cleanup_thread.start() - logger.info("Memory monitor activated") - - logger.info(f"Starting Uvicorn server in {host}:{port}...") - - config = uvicorn.Config( - app=app, - host=host, - workers=workers, - port=port, - timeout_keep_alive=channel_timeout, - headers=headers - ) - - server = 
uvicorn.Server(config) - - return server.serve() \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/Pipelines.py b/examples/server-async/Pipelines.py similarity index 59% rename from examples/server-async/DiffusersServer/Pipelines.py rename to examples/server-async/Pipelines.py index bc60d4811c3e..dcf5f6eed596 100644 --- a/examples/server-async/DiffusersServer/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,11 +1,12 @@ # Pipelines.py from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from diffusers.pipelines.flux.pipeline_flux import FluxPipeline -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline import torch import os import logging from pydantic import BaseModel +from dataclasses import dataclass, field +from typing import List logger = logging.getLogger(__name__) @@ -15,6 +16,13 @@ class TextToImageInput(BaseModel): size: str | None = None n: int | None = None + +@dataclass +class PresetModels: + SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) + SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) + Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-schnell']) + class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): self.model_path = model_path or os.getenv("MODEL_PATH") @@ -72,28 +80,40 @@ def start(self): else: raise Exception("No CUDA or MPS device available") -class TextToImagePipelineSD: - def __init__(self, model_path: str | None = None): - self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: StableDiffusionPipeline | None = None - self.device: str | None = None +class ModelPipelineInitializer: + def __init__(self, model: str = '', type_models: str = 't2im'): + self.model = model + self.type_models = type_models + self.pipeline = None + self.device = "cuda" if torch.cuda.is_available() else "mps" + self.model_type = None - def start(self): - if torch.cuda.is_available(): - model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" - logger.info("Loading CUDA") - self.device = "cuda" - self.pipeline = StableDiffusionPipeline.from_pretrained( - model_path, - torch_dtype=torch.float16, - ).to(device=self.device) - elif torch.backends.mps.is_available(): - model_path = self.model_path or "sd-legacy/stable-diffusion-v1-5" - logger.info("Loading MPS for Mac M Series") - self.device = "mps" - self.pipeline = StableDiffusionPipeline.from_pretrained( - model_path, - torch_dtype=torch.float16, - ).to(device=self.device) + def initialize_pipeline(self): + if not self.model: + raise ValueError("Model name not provided") + + # Check if model exists in PresetModels + preset_models = PresetModels() + + # Determine which model type we're dealing with + if self.model in preset_models.SD3: + self.model_type = "SD3" + elif self.model in preset_models.SD3_5: + self.model_type = "SD3_5" + elif self.model in preset_models.Flux: + self.model_type = "Flux" else: - raise Exception("No CUDA or MPS device available") + self.model_type = "SD" + + # Create appropriate pipeline based on model type and type_models + if self.type_models == 't2im': + if self.model_type in ["SD3", "SD3_5"]: + self.pipeline = TextToImagePipelineSD3(self.model) + elif self.model_type == "Flux": + self.pipeline = 
TextToImagePipelineFlux(self.model) + else: + raise ValueError(f"Model type {self.model_type} not supported for text-to-image") + elif self.type_models == 't2v': + raise ValueError(f"Unsupported type_models: {self.type_models}") + + return self.pipeline \ No newline at end of file diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 59c8cd6eda62..ce5b01724729 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,6 +1,6 @@ # Asynchronous server and parallel execution of models -> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines and a custom `diffusers` fork. +> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines. > We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.) ## ⚠️ IMPORTANT diff --git a/examples/server-async/server.py b/examples/server-async/server.py deleted file mode 100644 index 590522038a53..000000000000 --- a/examples/server-async/server.py +++ /dev/null @@ -1,11 +0,0 @@ -# DiffusersServerApp already handles the inference server and everything else internally, you -# just need to do these basic configurations and run the script with "python server.py" -# and you already get access to the inference APIs. 
-from DiffusersServer import DiffusersServerApp - -app = DiffusersServerApp( - model='stabilityai/stable-diffusion-3.5-medium', - type_model='t2im', - threads=3, - enable_memory_monitor=True -) \ No newline at end of file diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py new file mode 100644 index 000000000000..0cf1724e70ac --- /dev/null +++ b/examples/server-async/serverasync.py @@ -0,0 +1,223 @@ +from fastapi import FastAPI, HTTPException, Request +from fastapi.responses import FileResponse +from fastapi.middleware.cors import CORSMiddleware +from fastapi.concurrency import run_in_threadpool +from pydantic import BaseModel +from .Pipelines import ModelPipelineInitializer +from .utils import Utils, RequestScopedPipeline +import logging +from diffusers import * +import random +from dataclasses import dataclass +import os +import torch +import threading +import gc +from typing import Optional, Dict, Any, Type +from contextlib import asynccontextmanager +import asyncio + + +@dataclass +class ServerConfigModels: + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' + constructor_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None + components: Optional[Dict[str, Any]] = None + torch_dtype: Optional[torch.dtype] = None + host: str = '0.0.0.0' + port: int = 8500 + +server_config = ServerConfigModels() + +@asynccontextmanager +async def lifespan(app: FastAPI): + logging.basicConfig(level=logging.INFO) + app.state.logger = logging.getLogger("diffusers-server") + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' + os.environ['CUDA_LAUNCH_BLOCKING'] = '0' + + app.state.total_requests = 0 + app.state.active_inferences = 0 + app.state.metrics_lock = asyncio.Lock() + app.state.metrics_task = None + + app.state.utils_app = Utils( + host=server_config.host, + port=server_config.port, + ) + + async def metrics_loop(): + try: + while True: + async with app.state.metrics_lock: + total = app.state.total_requests + active = app.state.active_inferences + app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}") + await asyncio.sleep(5) + except asyncio.CancelledError: + app.state.logger.info("Metrics loop cancelled") + raise + + app.state.metrics_task = asyncio.create_task(metrics_loop()) + + try: + yield + finally: + task = app.state.metrics_task + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + try: + stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None) + if callable(stop_fn): + await run_in_threadpool(stop_fn) + except Exception as e: + app.state.logger.warning(f"Error during pipeline shutdown: {e}") + + app.state.logger.info("Lifespan shutdown complete") + +app = FastAPI(lifespan=lifespan) + +logger = logging.getLogger("DiffusersServer.Pipelines") + + + +initializer = ModelPipelineInitializer( + model=server_config.model, + type_models=server_config.type_models, +) +model_pipeline = initializer.initialize_pipeline() +model_pipeline.start() + +request_pipe = RequestScopedPipeline(model_pipeline.pipeline) +pipeline_lock = threading.Lock() + +logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})") + +app.state.MODEL_INITIALIZER = initializer +app.state.MODEL_PIPELINE = model_pipeline +app.state.REQUEST_PIPE = request_pipe +app.state.PIPELINE_LOCK = pipeline_lock + +class JSONBodyQueryAPI(BaseModel): + model : str | None = None + prompt : 
str + negative_prompt : str | None = None + num_inference_steps : int = 28 + num_images_per_prompt : int = 1 + +@app.middleware("http") +async def count_requests_middleware(request: Request, call_next): + async with app.state.metrics_lock: + app.state.total_requests += 1 + response = await call_next(request) + return response + + +@app.get("/") +async def root(): + return {"message": "Welcome to the Diffusers Server"} + +@app.post("/api/diffusers/inference") +async def api(json: JSONBodyQueryAPI): + prompt = json.prompt + negative_prompt = json.negative_prompt or "" + num_steps = json.num_inference_steps + num_images_per_prompt = json.num_images_per_prompt + + wrapper = app.state.MODEL_PIPELINE + initializer = app.state.MODEL_INITIALIZER + + utils_app = app.state.utils_app + + + if not wrapper or not wrapper.pipeline: + raise HTTPException(500, "Model not initialized correctly") + if not prompt.strip(): + raise HTTPException(400, "No prompt provided") + + + def make_generator(): + g = torch.Generator(device=initializer.device) + return g.manual_seed(random.randint(0, 10_000_000)) + + req_pipe = app.state.REQUEST_PIPE + + def infer(): + gen = make_generator() + return req_pipe.generate( + prompt=prompt, + negative_prompt=negative_prompt, + generator=gen, + num_inference_steps=num_steps, + num_images_per_prompt=num_images_per_prompt, + device=initializer.device, + output_type="pil", + ) + + try: + async with app.state.metrics_lock: + app.state.active_inferences += 1 + + output = await run_in_threadpool(infer) + + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + + urls = [utils_app.save_image(img) for img in output.images] + return {"response": urls} + + except Exception as e: + async with app.state.metrics_lock: + app.state.active_inferences = max(0, app.state.active_inferences - 1) + logger.error(f"Error during inference: {e}") + raise HTTPException(500, f"Error in processing: {e}") + + finally: + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.ipc_collect() + gc.collect() + + +@app.get("/images/{filename}") +async def serve_image(filename: str): + utils_app = app.state.utils_app + file_path = os.path.join(utils_app.image_dir, filename) + if not os.path.isfile(file_path): + raise HTTPException(status_code=404, detail="Image not found") + return FileResponse(file_path, media_type="image/png") + +@app.get("/api/status") +async def get_status(): + memory_info = {} + if torch.cuda.is_available(): + memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB + memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB + memory_info = { + "memory_allocated_gb": round(memory_allocated, 2), + "memory_reserved_gb": round(memory_reserved, 2), + "device": torch.cuda.get_device_name(0) + } + + return { + "current_model" : server_config.model, + "type_models" : server_config.type_models, + "memory" : memory_info} + + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) diff --git a/examples/server-async/test.py b/examples/server-async/test.py index 2a68c77bb28f..2c27146d0bd0 100644 --- a/examples/server-async/test.py +++ b/examples/server-async/test.py @@ -5,7 +5,7 @@ SERVER_URL = "http://localhost:8500/api/diffusers/inference" BASE_URL = "http://localhost:8500" -DOWNLOAD_FOLDER = "imagenes_generadas" +DOWNLOAD_FOLDER = "generated_images" WAIT_BEFORE_DOWNLOAD = 2 # 
seconds os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) diff --git a/examples/server-async/utils/__init__.py b/examples/server-async/utils/__init__.py new file mode 100644 index 000000000000..741cd9bb0219 --- /dev/null +++ b/examples/server-async/utils/__init__.py @@ -0,0 +1,2 @@ +from .requestscopedpipeline import RequestScopedPipeline +from .utils import Utils \ No newline at end of file diff --git a/examples/server-async/DiffusersServer/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py similarity index 100% rename from examples/server-async/DiffusersServer/utils/requestscopedpipeline.py rename to examples/server-async/utils/requestscopedpipeline.py diff --git a/examples/server-async/DiffusersServer/utils/scheduler.py b/examples/server-async/utils/scheduler.py similarity index 100% rename from examples/server-async/DiffusersServer/utils/scheduler.py rename to examples/server-async/utils/scheduler.py diff --git a/examples/server-async/utils/utils.py b/examples/server-async/utils/utils.py new file mode 100644 index 000000000000..e3dbb45677e1 --- /dev/null +++ b/examples/server-async/utils/utils.py @@ -0,0 +1,44 @@ +import os +import tempfile +import torch +import uuid +import gc +import logging + +logger = logging.getLogger(__name__) + +class Utils: + def __init__(self, host: str = '0.0.0.0', port: int = 8500): + self.service_url = f"http://{host}:{port}" + self.image_dir = os.path.join(tempfile.gettempdir(), "images") + if not os.path.exists(self.image_dir): + os.makedirs(self.image_dir) + + self.video_dir = os.path.join(tempfile.gettempdir(), "videos") + if not os.path.exists(self.video_dir): + os.makedirs(self.video_dir) + + def save_image(self, image): + if hasattr(image, "to"): + try: + image = image.to("cpu") + except Exception: + pass + + if isinstance(image, torch.Tensor): + from torchvision import transforms + to_pil = transforms.ToPILImage() + image = to_pil(image.squeeze(0).clamp(0, 1)) + + filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png" + image_path = os.path.join(self.image_dir, filename) + logger.info(f"Saving image to {image_path}") + + image.save(image_path, format="PNG", optimize=True) + + del image + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return os.path.join(self.service_url, "images", filename) \ No newline at end of file From ac5c9e6d3a6c3014741a974ad48550234896df86 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Tue, 16 Sep 2025 19:48:26 -0600 Subject: [PATCH 27/34] Update examples/server-async/README.md --- examples/server-async/Pipelines.py | 3 -- examples/server-async/README.md | 63 +++++++++++++++++++++------- examples/server-async/serverasync.py | 5 +++ 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/examples/server-async/Pipelines.py b/examples/server-async/Pipelines.py index dcf5f6eed596..d0012251da5d 100644 --- a/examples/server-async/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,4 +1,3 @@ -# Pipelines.py from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline from diffusers.pipelines.flux.pipeline_flux import FluxPipeline import torch @@ -102,8 +101,6 @@ def initialize_pipeline(self): self.model_type = "SD3_5" elif self.model in preset_models.Flux: self.model_type = "Flux" - else: - self.model_type = "SD" # Create appropriate pipeline based on model type and type_models if self.type_models == 't2im': diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 
ce5b01724729..6842d59486e9 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -5,24 +5,24 @@ ## ⚠️ IMPORTANT -* The server and inference harness live in this repo: `https://github.com/F4k3r22/DiffusersServer`. - The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. +* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. ## Necessary components -All the components needed to create the inference server are in `DiffusersServer/` +All the components needed to create the inference server are in the current directory: ``` -DiffusersServer/ +server-async/ ├── utils/ ├─────── __init__.py -├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences -├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model -├── __init__.py -├── create_server.py # helper script to build/run the app programmatically -├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) -├── serverasync.py # FastAPI app factory (create\_app\_fastapi) -├── uvicorn_diffu.py # convenience script to start uvicorn with recommended flags +├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences +├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model +├─────── utils.py # Image/video saving utilities and service configuration +├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) +├── serverasync.py # FastAPI app with lifespan management and async inference endpoints +├── test.py # Client test script for inference requests +├── requirements.txt # Dependencies +└── README.md # This documentation ``` ## What `diffusers-async` adds / Why we needed it @@ -69,13 +69,28 @@ pip install -r requirements.txt ### 2) Start the server -Using the `server.py` file that already has everything you need: +Using the `serverasync.py` file that already has everything you need: ```bash -python server.py +python serverasync.py ``` -### 3) Example request +The server will start on `http://localhost:8500` by default with the following features: +- FastAPI application with async lifespan management +- Automatic model loading and pipeline initialization +- Request counting and active inference tracking +- Memory cleanup after each inference +- CORS middleware for cross-origin requests + +### 3) Test the server + +Use the included test script: + +```bash +python test.py +``` + +Or send a manual request: `POST /api/diffusers/inference` with JSON body: @@ -95,6 +110,13 @@ Response example: } ``` +### 4) Server endpoints + +- `GET /` - Welcome message +- `POST /api/diffusers/inference` - Main inference endpoint +- `GET /images/{filename}` - Serve generated images +- `GET /api/status` - Server status and memory info + ## Advanced Configuration ### RequestScopedPipeline Parameters @@ -117,6 +139,19 @@ RequestScopedPipeline( * Enhanced debugging with `__repr__` and `__str__` methods * Full compatibility with existing scheduler APIs +### Server Configuration + +The server configuration can be modified in `serverasync.py` through the `ServerConfigModels` dataclass: + +```python +@dataclass +class ServerConfigModels: + model: str = 'stabilityai/stable-diffusion-3-medium' + type_models: str = 't2im' + host: 
str = '0.0.0.0' + port: int = 8500 +``` + ## Troubleshooting (quick) * `Already borrowed` — previously a Rust tokenizer concurrency error. diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py index 0cf1724e70ac..1723eb119849 100644 --- a/examples/server-async/serverasync.py +++ b/examples/server-async/serverasync.py @@ -221,3 +221,8 @@ async def get_status(): allow_methods=["*"], allow_headers=["*"], ) + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host=server_config.host, port=server_config.port) \ No newline at end of file From 72e021564da2dc7b5f395e9d7e7d0c6c04522c68 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Tue, 16 Sep 2025 20:07:38 -0600 Subject: [PATCH 28/34] Fixed import errors in 'examples/server-async/serverasync.py' --- examples/server-async/README.md | 2 +- examples/server-async/serverasync.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index 6842d59486e9..d3feb9a092ab 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -146,7 +146,7 @@ The server configuration can be modified in `serverasync.py` through the `Server ```python @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' + model: str = 'stabilityai/stable-diffusion-3.5-medium' type_models: str = 't2im' host: str = '0.0.0.0' port: int = 8500 diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py index 1723eb119849..4f114f93d63f 100644 --- a/examples/server-async/serverasync.py +++ b/examples/server-async/serverasync.py @@ -3,10 +3,9 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel -from .Pipelines import ModelPipelineInitializer -from .utils import Utils, RequestScopedPipeline +from Pipelines import ModelPipelineInitializer +from utils import Utils, RequestScopedPipeline import logging -from diffusers import * import random from dataclasses import dataclass import os @@ -20,7 +19,7 @@ @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3-medium' + model: str = 'stabilityai/stable-diffusion-3.5-medium' type_models: str = 't2im' constructor_pipeline: Optional[Type] = None custom_pipeline: Optional[Type] = None From edd550ba5dcaabafececbf049036203716b319ef Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 17 Sep 2025 10:34:58 -0600 Subject: [PATCH 29/34] Flux Pipeline Discard --- examples/server-async/Pipelines.py | 37 ------------------------------ 1 file changed, 37 deletions(-) diff --git a/examples/server-async/Pipelines.py b/examples/server-async/Pipelines.py index d0012251da5d..c30669d26e99 100644 --- a/examples/server-async/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,5 +1,4 @@ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline -from diffusers.pipelines.flux.pipeline_flux import FluxPipeline import torch import os import logging @@ -20,7 +19,6 @@ class TextToImageInput(BaseModel): class PresetModels: SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) - Flux: List[str] = field(default_factory=lambda: ['black-forest-labs/FLUX.1-dev', 
'black-forest-labs/FLUX.1-schnell']) class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): @@ -48,37 +46,6 @@ def start(self): else: raise Exception("No CUDA or MPS device available") -class TextToImagePipelineFlux: - def __init__(self, model_path: str | None = None, low_vram: bool = False): - self.model_path = model_path or os.getenv("MODEL_PATH") - self.pipeline: FluxPipeline | None = None - self.device: str | None = None - self.low_vram = low_vram - - def start(self): - if torch.cuda.is_available(): - model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" - logger.info("Loading CUDA") - self.device = "cuda" - self.pipeline = FluxPipeline.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - ).to(device=self.device) - if self.low_vram: - self.pipeline.enable_model_cpu_offload() - else: - pass - elif torch.backends.mps.is_available(): - model_path = self.model_path or "black-forest-labs/FLUX.1-schnell" - logger.info("Loading MPS for Mac M Series") - self.device = "mps" - self.pipeline = FluxPipeline.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - ).to(device=self.device) - else: - raise Exception("No CUDA or MPS device available") - class ModelPipelineInitializer: def __init__(self, model: str = '', type_models: str = 't2im'): self.model = model @@ -99,15 +66,11 @@ def initialize_pipeline(self): self.model_type = "SD3" elif self.model in preset_models.SD3_5: self.model_type = "SD3_5" - elif self.model in preset_models.Flux: - self.model_type = "Flux" # Create appropriate pipeline based on model type and type_models if self.type_models == 't2im': if self.model_type in ["SD3", "SD3_5"]: self.pipeline = TextToImagePipelineSD3(self.model) - elif self.model_type == "Flux": - self.pipeline = TextToImagePipelineFlux(self.model) else: raise ValueError(f"Model type {self.model_type} not supported for text-to-image") elif self.type_models == 't2v': From 6b693673e460ed1ae7a9a9b38437a5df323709ce Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Wed, 17 Sep 2025 12:05:11 -0600 Subject: [PATCH 30/34] Update examples/server-async/README.md --- examples/server-async/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server-async/README.md b/examples/server-async/README.md index d3feb9a092ab..a47ab7c7f224 100644 --- a/examples/server-async/README.md +++ b/examples/server-async/README.md @@ -1,11 +1,11 @@ # Asynchronous server and parallel execution of models -> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3/Flux pipelines. +> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3 pipelines. > We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.) 
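To make the pattern described above concrete, here is a minimal, self-contained sketch of the idea (shared heavy weights, per-request clone of only the small stateful scheduler). `SharedPipeline` and `DummyScheduler` are toy stand-ins for illustration only, not classes from this example:

```python
import copy
import threading


class DummyScheduler:
    """Tiny stand-in for a diffusers scheduler: small but stateful."""

    def __init__(self):
        self.timesteps = []

    def set_timesteps(self, n):
        self.timesteps = list(range(n, 0, -1))


class SharedPipeline:
    """Heavy weights live here once; per-request copies only swap the scheduler."""

    def __init__(self, weights, scheduler):
        self.weights = weights        # imagine several GB of tensors, shared by all requests
        self.scheduler = scheduler    # mutable per-run state, must not be shared

    def __call__(self, prompt, num_inference_steps):
        self.scheduler.set_timesteps(num_inference_steps)
        return f"{prompt!r} rendered in {len(self.scheduler.timesteps)} steps"


base = SharedPipeline(weights=object(), scheduler=DummyScheduler())


def handle_request(prompt, steps):
    local = copy.copy(base)                          # lightweight view: weights stay shared
    local.scheduler = copy.deepcopy(base.scheduler)  # only the small stateful part is cloned
    print(local(prompt, steps))


threads = [threading.Thread(target=handle_request, args=(f"prompt {i}", 20 + i)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

The real `RequestScopedPipeline` in this example does the same thing with extra care for tokenizers, hooks, and other mutable attributes.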
## ⚠️ IMPORTANT -* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` and `Flux.1` concurrently while keeping a single copy of the heavy model parameters on GPU. +* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` concurrently while keeping a single copy of the heavy model parameters on GPU. ## Necessary components @@ -18,7 +18,7 @@ server-async/ ├─────── scheduler.py # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences ├─────── requestscopedpipeline.py # RequestScoped Pipeline for inference with a single in-memory model ├─────── utils.py # Image/video saving utilities and service configuration -├── Pipelines.py # pipeline loader classes (SD3, Flux, legacy SD, video) +├── Pipelines.py # pipeline loader classes (SD3) ├── serverasync.py # FastAPI app with lifespan management and async inference endpoints ├── test.py # Client test script for inference requests ├── requirements.txt # Dependencies From 7c4f88348a8d3536a4568398e8f1f81cabab1ddc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Sep 2025 04:07:16 +0000 Subject: [PATCH 31/34] Apply style fixes --- examples/server-async/Pipelines.py | 34 +++++--- examples/server-async/serverasync.py | 85 ++++++++++--------- examples/server-async/test.py | 7 +- examples/server-async/utils/__init__.py | 2 +- .../utils/requestscopedpipeline.py | 47 +++++----- examples/server-async/utils/scheduler.py | 20 +++-- examples/server-async/utils/utils.py | 14 +-- 7 files changed, 122 insertions(+), 87 deletions(-) diff --git a/examples/server-async/Pipelines.py b/examples/server-async/Pipelines.py index c30669d26e99..f89cac6a7e4b 100644 --- a/examples/server-async/Pipelines.py +++ b/examples/server-async/Pipelines.py @@ -1,13 +1,17 @@ -from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline -import torch -import os import logging -from pydantic import BaseModel -from dataclasses import dataclass, field +import os +from dataclasses import dataclass, field from typing import List +import torch +from pydantic import BaseModel + +from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline + + logger = logging.getLogger(__name__) + class TextToImageInput(BaseModel): model: str prompt: str @@ -17,8 +21,15 @@ class TextToImageInput(BaseModel): @dataclass class PresetModels: - SD3: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3-medium']) - SD3_5: List[str] = field(default_factory=lambda: ['stabilityai/stable-diffusion-3.5-large', 'stabilityai/stable-diffusion-3.5-large-turbo', 'stabilityai/stable-diffusion-3.5-medium']) + SD3: List[str] = field(default_factory=lambda: ["stabilityai/stable-diffusion-3-medium"]) + SD3_5: List[str] = field( + default_factory=lambda: [ + "stabilityai/stable-diffusion-3.5-large", + "stabilityai/stable-diffusion-3.5-large-turbo", + "stabilityai/stable-diffusion-3.5-medium", + ] + ) + class TextToImagePipelineSD3: def __init__(self, model_path: str | None = None): @@ -46,8 +57,9 @@ def start(self): else: raise Exception("No CUDA or MPS device available") + class ModelPipelineInitializer: - def __init__(self, model: str = '', type_models: str = 't2im'): + def __init__(self, model: str = "", type_models: str = "t2im"): self.model = model self.type_models = type_models self.pipeline = None @@ -68,12 +80,12 @@ def initialize_pipeline(self): self.model_type = "SD3_5" # Create appropriate pipeline based on model type and type_models - if 
self.type_models == 't2im': + if self.type_models == "t2im": if self.model_type in ["SD3", "SD3_5"]: self.pipeline = TextToImagePipelineSD3(self.model) else: raise ValueError(f"Model type {self.model_type} not supported for text-to-image") - elif self.type_models == 't2v': + elif self.type_models == "t2v": raise ValueError(f"Unsupported type_models: {self.type_models}") - return self.pipeline \ No newline at end of file + return self.pipeline diff --git a/examples/server-async/serverasync.py b/examples/server-async/serverasync.py index 4f114f93d63f..b279b36f9a84 100644 --- a/examples/server-async/serverasync.py +++ b/examples/server-async/serverasync.py @@ -1,41 +1,45 @@ -from fastapi import FastAPI, HTTPException, Request -from fastapi.responses import FileResponse -from fastapi.middleware.cors import CORSMiddleware -from fastapi.concurrency import run_in_threadpool -from pydantic import BaseModel -from Pipelines import ModelPipelineInitializer -from utils import Utils, RequestScopedPipeline +import asyncio +import gc import logging -import random -from dataclasses import dataclass import os -import torch +import random import threading -import gc -from typing import Optional, Dict, Any, Type from contextlib import asynccontextmanager -import asyncio +from dataclasses import dataclass +from typing import Any, Dict, Optional, Type + +import torch +from fastapi import FastAPI, HTTPException, Request +from fastapi.concurrency import run_in_threadpool +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse +from Pipelines import ModelPipelineInitializer +from pydantic import BaseModel + +from utils import RequestScopedPipeline, Utils @dataclass class ServerConfigModels: - model: str = 'stabilityai/stable-diffusion-3.5-medium' - type_models: str = 't2im' + model: str = "stabilityai/stable-diffusion-3.5-medium" + type_models: str = "t2im" constructor_pipeline: Optional[Type] = None - custom_pipeline: Optional[Type] = None + custom_pipeline: Optional[Type] = None components: Optional[Dict[str, Any]] = None torch_dtype: Optional[torch.dtype] = None - host: str = '0.0.0.0' + host: str = "0.0.0.0" port: int = 8500 + server_config = ServerConfigModels() + @asynccontextmanager async def lifespan(app: FastAPI): logging.basicConfig(level=logging.INFO) app.state.logger = logging.getLogger("diffusers-server") - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True' - os.environ['CUDA_LAUNCH_BLOCKING'] = '0' + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True" + os.environ["CUDA_LAUNCH_BLOCKING"] = "0" app.state.total_requests = 0 app.state.active_inferences = 0 @@ -81,12 +85,12 @@ async def metrics_loop(): app.state.logger.info("Lifespan shutdown complete") + app = FastAPI(lifespan=lifespan) logger = logging.getLogger("DiffusersServer.Pipelines") - initializer = ModelPipelineInitializer( model=server_config.model, type_models=server_config.type_models, @@ -104,12 +108,14 @@ async def metrics_loop(): app.state.REQUEST_PIPE = request_pipe app.state.PIPELINE_LOCK = pipeline_lock + class JSONBodyQueryAPI(BaseModel): - model : str | None = None - prompt : str - negative_prompt : str | None = None - num_inference_steps : int = 28 - num_images_per_prompt : int = 1 + model: str | None = None + prompt: str + negative_prompt: str | None = None + num_inference_steps: int = 28 + num_images_per_prompt: int = 1 + @app.middleware("http") async def count_requests_middleware(request: Request, call_next): @@ 
-123,25 +129,24 @@ async def count_requests_middleware(request: Request, call_next): async def root(): return {"message": "Welcome to the Diffusers Server"} + @app.post("/api/diffusers/inference") async def api(json: JSONBodyQueryAPI): - prompt = json.prompt - negative_prompt = json.negative_prompt or "" - num_steps = json.num_inference_steps + prompt = json.prompt + negative_prompt = json.negative_prompt or "" + num_steps = json.num_inference_steps num_images_per_prompt = json.num_images_per_prompt - wrapper = app.state.MODEL_PIPELINE + wrapper = app.state.MODEL_PIPELINE initializer = app.state.MODEL_INITIALIZER utils_app = app.state.utils_app - if not wrapper or not wrapper.pipeline: raise HTTPException(500, "Model not initialized correctly") if not prompt.strip(): raise HTTPException(400, "No prompt provided") - def make_generator(): g = torch.Generator(device=initializer.device) return g.manual_seed(random.randint(0, 10_000_000)) @@ -168,7 +173,7 @@ def infer(): async with app.state.metrics_lock: app.state.active_inferences = max(0, app.state.active_inferences - 1) - + urls = [utils_app.save_image(img) for img in output.images] return {"response": urls} @@ -195,27 +200,25 @@ async def serve_image(filename: str): raise HTTPException(status_code=404, detail="Image not found") return FileResponse(file_path, media_type="image/png") + @app.get("/api/status") async def get_status(): memory_info = {} if torch.cuda.is_available(): memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB - memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB + memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB memory_info = { "memory_allocated_gb": round(memory_allocated, 2), "memory_reserved_gb": round(memory_reserved, 2), - "device": torch.cuda.get_device_name(0) + "device": torch.cuda.get_device_name(0), } - return { - "current_model" : server_config.model, - "type_models" : server_config.type_models, - "memory" : memory_info} - + return {"current_model": server_config.model, "type_models": server_config.type_models, "memory": memory_info} + app.add_middleware( CORSMiddleware, - allow_origins=["*"], + allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], @@ -224,4 +227,4 @@ async def get_status(): if __name__ == "__main__": import uvicorn - uvicorn.run(app, host=server_config.host, port=server_config.port) \ No newline at end of file + uvicorn.run(app, host=server_config.host, port=server_config.port) diff --git a/examples/server-async/test.py b/examples/server-async/test.py index 2c27146d0bd0..e67317ea8f6b 100644 --- a/examples/server-async/test.py +++ b/examples/server-async/test.py @@ -1,8 +1,10 @@ import os import time import urllib.parse + import requests + SERVER_URL = "http://localhost:8500/api/diffusers/inference" BASE_URL = "http://localhost:8500" DOWNLOAD_FOLDER = "generated_images" @@ -10,6 +12,7 @@ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) + def save_from_url(url: str) -> str: """Download the given URL (relative or absolute) and save it locally.""" if url.startswith("/"): @@ -24,11 +27,12 @@ def save_from_url(url: str) -> str: f.write(resp.content) return path + def main(): payload = { "prompt": "The T-800 Terminator Robot Returning From The Future, Anime Style", "num_inference_steps": 30, - "num_images_per_prompt": 1 + "num_images_per_prompt": 1, } print("Sending request...") @@ -56,5 +60,6 @@ def main(): except Exception as e: print(f"Error downloading {u}: {e}") + if __name__ == "__main__": main() diff --git 
a/examples/server-async/utils/__init__.py b/examples/server-async/utils/__init__.py index 741cd9bb0219..731cfe491ae5 100644 --- a/examples/server-async/utils/__init__.py +++ b/examples/server-async/utils/__init__.py @@ -1,2 +1,2 @@ from .requestscopedpipeline import RequestScopedPipeline -from .utils import Utils \ No newline at end of file +from .utils import Utils diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index 79f79e28f5e7..57d1e2567169 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -1,17 +1,22 @@ -from typing import Optional, Any, Iterable, List import copy import threading +from typing import Any, Iterable, List, Optional + import torch + from diffusers.utils import logging + from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps logger = logging.get_logger(__name__) + def safe_tokenize(tokenizer, *args, lock, **kwargs): with lock: return tokenizer(*args, **kwargs) + class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", @@ -30,7 +35,7 @@ def __init__( auto_detect_mutables: bool = True, tensor_numel_threshold: int = 1_000_000, tokenizer_lock: Optional[threading.Lock] = None, - wrap_scheduler: bool = True + wrap_scheduler: bool = True, ): self._base = pipeline self.unet = getattr(pipeline, "unet", None) @@ -38,7 +43,7 @@ def __init__( self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) - if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: + if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None: if not isinstance(pipeline.scheduler, BaseAsyncScheduler): pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) @@ -62,9 +67,7 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] try: return wrapped_scheduler.clone_for_request( - num_inference_steps=num_inference_steps, - device=device, - **clone_kwargs + num_inference_steps=num_inference_steps, device=device, **clone_kwargs ) except Exception as e: logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") @@ -72,7 +75,7 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] return copy.deepcopy(wrapped_scheduler) except Exception as e: logger.warning(f"Deepcopy of scheduler failed: {e}. 
Returning original scheduler (*risky*).") - return wrapped_scheduler + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -140,7 +143,9 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = {"components",} + EXCLUDE_ATTRS = { + "components", + } for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -188,16 +193,16 @@ def _clone_mutable_attrs(self, base, local): def _is_tokenizer_component(self, component) -> bool: if component is None: return False - - tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + + tokenizer_methods = ["encode", "decode", "tokenize", "__call__"] has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - + class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = 'tokenizer' in class_name - - tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + has_tokenizer_in_name = "tokenizer" in class_name + + tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"] has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): @@ -216,7 +221,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = num_inference_steps=num_inference_steps, device=device, return_scheduler=True, - **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} + **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]}, ) final_scheduler = BaseAsyncScheduler(configured_scheduler) @@ -238,7 +243,9 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = setattr( local_pipe, name, - lambda *args, tok=tok, **kwargs: safe_tokenize(tok, *args, lock=self._tokenizer_lock, **kwargs) + lambda *args, tok=tok, **kwargs: safe_tokenize( + tok, *args, lock=self._tokenizer_lock, **kwargs + ), ) # b) wrap tokenizers in components dict @@ -246,7 +253,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = for key, val in local_pipe.components.items(): if val is None: continue - + if self._is_tokenizer_component(val): tokenizer_wrappers[f"components[{key}]"] = val local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( @@ -281,9 +288,9 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = try: for name, tok in tokenizer_wrappers.items(): if name.startswith("components["): - key = name[len("components["):-1] + key = name[len("components[") : -1] local_pipe.components[key] = tok else: setattr(local_pipe, name, tok) except Exception as e: - logger.debug(f"Error restoring wrapped tokenizers: {e}") \ No newline at end of file + logger.debug(f"Error restoring wrapped tokenizers: {e}") diff --git a/examples/server-async/utils/scheduler.py b/examples/server-async/utils/scheduler.py index 5925edfeab04..86d47cac6154 100644 --- a/examples/server-async/utils/scheduler.py +++ b/examples/server-async/utils/scheduler.py @@ -1,7 +1,9 @@ -from typing import Any, Optional, Union, List -import torch import copy import inspect +from typing import Any, List, Optional, Union + +import torch + class BaseAsyncScheduler: def __init__(self, scheduler: Any): @@ -11,12 +13,12 @@ def __getattr__(self, name: str): if 
hasattr(self.scheduler, name): return getattr(self.scheduler, name) raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") - + def __setattr__(self, name: str, value): - if name == 'scheduler': + if name == "scheduler": super().__setattr__(name, value) else: - if hasattr(self, 'scheduler') and hasattr(self.scheduler, name): + if hasattr(self, "scheduler") and hasattr(self.scheduler, name): setattr(self.scheduler, name, value) else: super().__setattr__(name, value) @@ -29,7 +31,7 @@ def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.d def __repr__(self): return f"BaseAsyncScheduler({repr(self.scheduler)})" - + def __str__(self): return f"BaseAsyncScheduler wrapping: {str(self.scheduler)}" @@ -91,7 +93,9 @@ def async_retrieve_timesteps( if hasattr(scheduler, "clone_for_request"): try: # clone_for_request may accept num_inference_steps or other kwargs; be permissive - scheduler_in_use = scheduler.clone_for_request(num_inference_steps=num_inference_steps or 0, device=device) + scheduler_in_use = scheduler.clone_for_request( + num_inference_steps=num_inference_steps or 0, device=device + ) except Exception: scheduler_in_use = copy.deepcopy(scheduler) else: @@ -134,4 +138,4 @@ def _accepts(param_name: str) -> bool: if return_scheduler: return timesteps_out, num_inference_steps, scheduler_in_use - return timesteps_out, num_inference_steps \ No newline at end of file + return timesteps_out, num_inference_steps diff --git a/examples/server-async/utils/utils.py b/examples/server-async/utils/utils.py index e3dbb45677e1..9f943305126c 100644 --- a/examples/server-async/utils/utils.py +++ b/examples/server-async/utils/utils.py @@ -1,14 +1,17 @@ +import gc +import logging import os import tempfile -import torch import uuid -import gc -import logging + +import torch + logger = logging.getLogger(__name__) + class Utils: - def __init__(self, host: str = '0.0.0.0', port: int = 8500): + def __init__(self, host: str = "0.0.0.0", port: int = 8500): self.service_url = f"http://{host}:{port}" self.image_dir = os.path.join(tempfile.gettempdir(), "images") if not os.path.exists(self.image_dir): @@ -27,6 +30,7 @@ def save_image(self, image): if isinstance(image, torch.Tensor): from torchvision import transforms + to_pil = transforms.ToPILImage() image = to_pil(image.squeeze(0).clamp(0, 1)) @@ -41,4 +45,4 @@ def save_image(self, image): if torch.cuda.is_available(): torch.cuda.empty_cache() - return os.path.join(self.service_url, "images", filename) \ No newline at end of file + return os.path.join(self.service_url, "images", filename) From f2e9f0242db51de66e7692f5adcefa9b1b7c1435 Mon Sep 17 00:00:00 2001 From: Fredy Rivera Date: Mon, 20 Oct 2025 18:11:52 -0600 Subject: [PATCH 32/34] Add thread-safe wrappers for components in pipeline Refactor requestscopedpipeline.py to add thread-safe wrappers for tokenizer, VAE, and image processor. Introduce locking mechanisms to ensure thread safety during concurrent access. 
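The commit below adds lock-based proxy wrappers around the tokenizer, VAE, and image processor. As a quick orientation before the diff, this is the generic shape of that pattern, shown as an illustrative sketch rather than the exact classes added here (`LockedProxy` and `ToyTokenizer` are hypothetical names):

```python
import threading


class LockedProxy:
    """Forward attribute access to `target`; run the guarded methods under `lock`."""

    def __init__(self, target, lock, guarded=("encode", "decode", "tokenize")):
        self._target = target
        self._lock = lock
        self._guarded = set(guarded)

    def __getattr__(self, name):
        attr = getattr(self._target, name)
        if name in self._guarded and callable(attr):
            def locked(*args, **kwargs):
                with self._lock:  # one caller at a time through the guarded method
                    return attr(*args, **kwargs)
            return locked
        return attr


class ToyTokenizer:
    def encode(self, text):
        return [ord(c) for c in text]


tok = LockedProxy(ToyTokenizer(), threading.Lock())
print(tok.encode("hi"))  # [104, 105], executed while holding the lock
```

Note that special-method calls such as `obj(...)` are looked up on the type and bypass `__getattr__`, which is why the tokenizer wrapper introduced in this commit also defines `__call__` explicitly.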
--- .../utils/requestscopedpipeline.py | 242 +++++++++++++----- 1 file changed, 173 insertions(+), 69 deletions(-) diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index 57d1e2567169..0b0e73ca04e2 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -1,26 +1,92 @@ +from typing import Optional, Any, Iterable, List import copy import threading -from typing import Any, Iterable, List, Optional - import torch - from diffusers.utils import logging - from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps - logger = logging.get_logger(__name__) +class ThreadSafeTokenizerWrapper: + def __init__(self, tokenizer, lock): + self._tokenizer = tokenizer + self._lock = lock -def safe_tokenize(tokenizer, *args, lock, **kwargs): - with lock: - return tokenizer(*args, **kwargs) - + self._thread_safe_methods = { + '__call__', 'encode', 'decode', 'tokenize', + 'encode_plus', 'batch_encode_plus', 'batch_decode' + } + + def __getattr__(self, name): + attr = getattr(self._tokenizer, name) + + if name in self._thread_safe_methods and callable(attr): + def wrapped_method(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped_method + + return attr + + def __call__(self, *args, **kwargs): + with self._lock: + return self._tokenizer(*args, **kwargs) + + def __setattr__(self, name, value): + if name.startswith('_'): + super().__setattr__(name, value) + else: + setattr(self._tokenizer, name, value) + + def __dir__(self): + return dir(self._tokenizer) + + +class ThreadSafeVAEWrapper: + def __init__(self, vae, lock): + self._vae = vae + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._vae, name) + # métodos que queremos proteger + if name in {"decode", "encode", "forward"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._vae, name, value) + +class ThreadSafeImageProcessorWrapper: + def __init__(self, proc, lock): + self._proc = proc + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._proc, name) + if name in {"postprocess", "preprocess"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._proc, name, value) class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", - "_offload_device", + "_offload_device", "_progress_bar_config", "_progress_bar", "_rng_state", @@ -38,23 +104,43 @@ def __init__( wrap_scheduler: bool = True, ): self._base = pipeline + + self.unet = getattr(pipeline, "unet", None) - self.vae = getattr(pipeline, "vae", None) + self.vae = getattr(pipeline, "vae", None) self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) - - if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None: + + self.transformer = getattr(pipeline, "transformer", None) + + if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: if not isinstance(pipeline.scheduler, BaseAsyncScheduler): pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) 
self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) + + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() + self._vae_lock = threading.Lock() + self._image_lock = threading.Lock() + self._auto_detect_mutables = bool(auto_detect_mutables) self._tensor_numel_threshold = int(tensor_numel_threshold) - self._auto_detected_attrs: List[str] = [] + def _detect_kernel_pipeline(self, pipeline) -> bool: + kernel_indicators = [ + 'text_encoding_cache', + 'memory_manager', + 'enable_optimizations', + '_create_request_context', + 'get_optimization_stats' + ] + + return any(hasattr(pipeline, attr) for attr in kernel_indicators) + + def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): base_sched = getattr(self._base, "scheduler", None) if base_sched is None: @@ -67,15 +153,25 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] try: return wrapped_scheduler.clone_for_request( - num_inference_steps=num_inference_steps, device=device, **clone_kwargs + num_inference_steps=num_inference_steps, + device=device, + **clone_kwargs ) except Exception as e: - logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()") + logger.debug(f"clone_for_request failed: {e}; trying shallow copy fallback") try: - return copy.deepcopy(wrapped_scheduler) - except Exception as e: - logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).") - return wrapped_scheduler + if hasattr(wrapped_scheduler, 'scheduler'): + try: + copied_scheduler = copy.copy(wrapped_scheduler.scheduler) + return BaseAsyncScheduler(copied_scheduler) + except Exception: + return wrapped_scheduler + else: + copied_scheduler = copy.copy(wrapped_scheduler) + return BaseAsyncScheduler(copied_scheduler) + except Exception as e2: + logger.warning(f"Shallow copy of scheduler also failed: {e2}. 
Using original scheduler (*thread-unsafe but functional*).") + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -86,6 +182,8 @@ def _autodetect_mutables(self, max_attrs: int = 40): candidates: List[str] = [] seen = set() + + for name in dir(self._base): if name.startswith("__"): continue @@ -93,6 +191,7 @@ def _autodetect_mutables(self, max_attrs: int = 40): continue if name in ("to", "save_pretrained", "from_pretrained"): continue + try: val = getattr(self._base, name) except Exception: @@ -100,11 +199,9 @@ def _autodetect_mutables(self, max_attrs: int = 40): import types - # skip callables and modules if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)): continue - # containers -> candidate if isinstance(val, (dict, list, set, tuple, bytearray)): candidates.append(name) seen.add(name) @@ -143,9 +240,7 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = { - "components", - } + EXCLUDE_ATTRS = {"components",} for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -193,18 +288,21 @@ def _clone_mutable_attrs(self, base, local): def _is_tokenizer_component(self, component) -> bool: if component is None: return False - - tokenizer_methods = ["encode", "decode", "tokenize", "__call__"] + + tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - + class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = "tokenizer" in class_name - - tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"] + has_tokenizer_in_name = 'tokenizer' in class_name + + tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) + def _should_wrap_tokenizers(self) -> bool: + return True + def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs): local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device) @@ -214,6 +312,15 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.warning(f"copy.copy(self._base) failed: {e}. 
Falling back to deepcopy (may increase memory).") local_pipe = copy.deepcopy(self._base) + try: + if hasattr(local_pipe, "vae") and local_pipe.vae is not None and not isinstance(local_pipe.vae, ThreadSafeVAEWrapper): + local_pipe.vae = ThreadSafeVAEWrapper(local_pipe.vae, self._vae_lock) + + if hasattr(local_pipe, "image_processor") and local_pipe.image_processor is not None and not isinstance(local_pipe.image_processor, ThreadSafeImageProcessorWrapper): + local_pipe.image_processor = ThreadSafeImageProcessorWrapper(local_pipe.image_processor, self._image_lock) + except Exception as e: + logger.debug(f"Could not wrap vae/image_processor: {e}") + if local_scheduler is not None: try: timesteps, num_steps, configured_scheduler = async_retrieve_timesteps( @@ -221,7 +328,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = num_inference_steps=num_inference_steps, device=device, return_scheduler=True, - **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]}, + **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} ) final_scheduler = BaseAsyncScheduler(configured_scheduler) @@ -230,48 +337,45 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") self._clone_mutable_attrs(self._base, local_pipe) + - # 4) wrap tokenizers on the local pipe with the lock wrapper - tokenizer_wrappers = {} # name -> original_tokenizer - try: - # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...) - for name in dir(local_pipe): - if "tokenizer" in name and not name.startswith("_"): - tok = getattr(local_pipe, name, None) - if tok is not None and self._is_tokenizer_component(tok): - tokenizer_wrappers[name] = tok - setattr( - local_pipe, - name, - lambda *args, tok=tok, **kwargs: safe_tokenize( - tok, *args, lock=self._tokenizer_lock, **kwargs - ), - ) - - # b) wrap tokenizers in components dict - if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): - for key, val in local_pipe.components.items(): - if val is None: - continue - - if self._is_tokenizer_component(val): - tokenizer_wrappers[f"components[{key}]"] = val - local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize( - tokenizer, *args, lock=self._tokenizer_lock, **kwargs - ) + original_tokenizers = {} + + if self._should_wrap_tokenizers(): + try: + for name in dir(local_pipe): + if "tokenizer" in name and not name.startswith("_"): + tok = getattr(local_pipe, name, None) + if tok is not None and self._is_tokenizer_component(tok): + if not isinstance(tok, ThreadSafeTokenizerWrapper): + original_tokenizers[name] = tok + wrapped_tokenizer = ThreadSafeTokenizerWrapper(tok, self._tokenizer_lock) + setattr(local_pipe, name, wrapped_tokenizer) + + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): + for key, val in local_pipe.components.items(): + if val is None: + continue + + if self._is_tokenizer_component(val): + if not isinstance(val, ThreadSafeTokenizerWrapper): + original_tokenizers[f"components[{key}]"] = val + wrapped_tokenizer = ThreadSafeTokenizerWrapper(val, self._tokenizer_lock) + local_pipe.components[key] = wrapped_tokenizer - except Exception as e: - logger.debug(f"Tokenizer wrapping step encountered an error: {e}") + except Exception as e: + logger.debug(f"Tokenizer wrapping step encountered an error: {e}") result = None cm = getattr(local_pipe, "model_cpu_offload_context", 
None) + try: + if callable(cm): try: with cm(): result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) except TypeError: - # cm might be a context manager instance rather than callable try: with cm: result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) @@ -279,18 +383,18 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.") result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) else: - # no offload context available — call directly result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs) return result finally: try: - for name, tok in tokenizer_wrappers.items(): + for name, tok in original_tokenizers.items(): if name.startswith("components["): - key = name[len("components[") : -1] - local_pipe.components[key] = tok + key = name[len("components["):-1] + if hasattr(local_pipe, 'components') and isinstance(local_pipe.components, dict): + local_pipe.components[key] = tok else: setattr(local_pipe, name, tok) except Exception as e: - logger.debug(f"Error restoring wrapped tokenizers: {e}") + logger.debug(f"Error restoring original tokenizers: {e}") From 489da5d5139119caf915ab7ef37537ffe89c6806 Mon Sep 17 00:00:00 2001 From: F4k3r22 Date: Mon, 20 Oct 2025 18:58:34 -0600 Subject: [PATCH 33/34] Add wrappers.py --- .../utils/requestscopedpipeline.py | 77 +------------------ examples/server-async/utils/wrappers.py | 74 ++++++++++++++++++ 2 files changed, 75 insertions(+), 76 deletions(-) create mode 100644 examples/server-async/utils/wrappers.py diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index 0b0e73ca04e2..c5acc35d5fab 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -4,85 +4,10 @@ import torch from diffusers.utils import logging from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps +from .wrappers import ThreadSafeTokenizerWrapper, ThreadSafeVAEWrapper, ThreadSafeImageProcessorWrapper logger = logging.get_logger(__name__) -class ThreadSafeTokenizerWrapper: - def __init__(self, tokenizer, lock): - self._tokenizer = tokenizer - self._lock = lock - - self._thread_safe_methods = { - '__call__', 'encode', 'decode', 'tokenize', - 'encode_plus', 'batch_encode_plus', 'batch_decode' - } - - def __getattr__(self, name): - attr = getattr(self._tokenizer, name) - - if name in self._thread_safe_methods and callable(attr): - def wrapped_method(*args, **kwargs): - with self._lock: - return attr(*args, **kwargs) - return wrapped_method - - return attr - - def __call__(self, *args, **kwargs): - with self._lock: - return self._tokenizer(*args, **kwargs) - - def __setattr__(self, name, value): - if name.startswith('_'): - super().__setattr__(name, value) - else: - setattr(self._tokenizer, name, value) - - def __dir__(self): - return dir(self._tokenizer) - - -class ThreadSafeVAEWrapper: - def __init__(self, vae, lock): - self._vae = vae - self._lock = lock - - def __getattr__(self, name): - attr = getattr(self._vae, name) - # métodos que queremos proteger - if name in {"decode", "encode", "forward"} and callable(attr): - def wrapped(*args, **kwargs): - with self._lock: - return attr(*args, **kwargs) - return wrapped - return attr - - def __setattr__(self, name, value): - if name.startswith("_"): - super().__setattr__(name, value) - else: - 
setattr(self._vae, name, value) - -class ThreadSafeImageProcessorWrapper: - def __init__(self, proc, lock): - self._proc = proc - self._lock = lock - - def __getattr__(self, name): - attr = getattr(self._proc, name) - if name in {"postprocess", "preprocess"} and callable(attr): - def wrapped(*args, **kwargs): - with self._lock: - return attr(*args, **kwargs) - return wrapped - return attr - - def __setattr__(self, name, value): - if name.startswith("_"): - super().__setattr__(name, value) - else: - setattr(self._proc, name, value) - class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", diff --git a/examples/server-async/utils/wrappers.py b/examples/server-async/utils/wrappers.py new file mode 100644 index 000000000000..5130f175c2b1 --- /dev/null +++ b/examples/server-async/utils/wrappers.py @@ -0,0 +1,74 @@ +class ThreadSafeTokenizerWrapper: + def __init__(self, tokenizer, lock): + self._tokenizer = tokenizer + self._lock = lock + + self._thread_safe_methods = { + '__call__', 'encode', 'decode', 'tokenize', + 'encode_plus', 'batch_encode_plus', 'batch_decode' + } + + def __getattr__(self, name): + attr = getattr(self._tokenizer, name) + + if name in self._thread_safe_methods and callable(attr): + def wrapped_method(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped_method + + return attr + + def __call__(self, *args, **kwargs): + with self._lock: + return self._tokenizer(*args, **kwargs) + + def __setattr__(self, name, value): + if name.startswith('_'): + super().__setattr__(name, value) + else: + setattr(self._tokenizer, name, value) + + def __dir__(self): + return dir(self._tokenizer) + + +class ThreadSafeVAEWrapper: + def __init__(self, vae, lock): + self._vae = vae + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._vae, name) + if name in {"decode", "encode", "forward"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._vae, name, value) + +class ThreadSafeImageProcessorWrapper: + def __init__(self, proc, lock): + self._proc = proc + self._lock = lock + + def __getattr__(self, name): + attr = getattr(self._proc, name) + if name in {"postprocess", "preprocess"} and callable(attr): + def wrapped(*args, **kwargs): + with self._lock: + return attr(*args, **kwargs) + return wrapped + return attr + + def __setattr__(self, name, value): + if name.startswith("_"): + super().__setattr__(name, value) + else: + setattr(self._proc, name, value) \ No newline at end of file From 581847fa6940bdb73a72752cba508be7131a0dc7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 21 Oct 2025 05:25:17 +0000 Subject: [PATCH 34/34] Apply style fixes --- .../utils/requestscopedpipeline.py | 105 ++++++++++-------- examples/server-async/utils/wrappers.py | 30 +++-- 2 files changed, 79 insertions(+), 56 deletions(-) diff --git a/examples/server-async/utils/requestscopedpipeline.py b/examples/server-async/utils/requestscopedpipeline.py index c5acc35d5fab..9c3276c31c69 100644 --- a/examples/server-async/utils/requestscopedpipeline.py +++ b/examples/server-async/utils/requestscopedpipeline.py @@ -1,17 +1,22 @@ -from typing import Optional, Any, Iterable, List import copy import threading +from typing import Any, Iterable, List, Optional + import torch + from diffusers.utils import logging + from .scheduler import 
BaseAsyncScheduler, async_retrieve_timesteps -from .wrappers import ThreadSafeTokenizerWrapper, ThreadSafeVAEWrapper, ThreadSafeImageProcessorWrapper +from .wrappers import ThreadSafeImageProcessorWrapper, ThreadSafeTokenizerWrapper, ThreadSafeVAEWrapper + logger = logging.get_logger(__name__) + class RequestScopedPipeline: DEFAULT_MUTABLE_ATTRS = [ "_all_hooks", - "_offload_device", + "_offload_device", "_progress_bar_config", "_progress_bar", "_rng_state", @@ -29,42 +34,39 @@ def __init__( wrap_scheduler: bool = True, ): self._base = pipeline - - + self.unet = getattr(pipeline, "unet", None) - self.vae = getattr(pipeline, "vae", None) + self.vae = getattr(pipeline, "vae", None) self.text_encoder = getattr(pipeline, "text_encoder", None) self.components = getattr(pipeline, "components", None) - + self.transformer = getattr(pipeline, "transformer", None) - - if wrap_scheduler and hasattr(pipeline, 'scheduler') and pipeline.scheduler is not None: + + if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None: if not isinstance(pipeline.scheduler, BaseAsyncScheduler): pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler) self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS) - - + self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock() self._vae_lock = threading.Lock() self._image_lock = threading.Lock() - + self._auto_detect_mutables = bool(auto_detect_mutables) self._tensor_numel_threshold = int(tensor_numel_threshold) self._auto_detected_attrs: List[str] = [] def _detect_kernel_pipeline(self, pipeline) -> bool: kernel_indicators = [ - 'text_encoding_cache', - 'memory_manager', - 'enable_optimizations', - '_create_request_context', - 'get_optimization_stats' + "text_encoding_cache", + "memory_manager", + "enable_optimizations", + "_create_request_context", + "get_optimization_stats", ] - - return any(hasattr(pipeline, attr) for attr in kernel_indicators) + return any(hasattr(pipeline, attr) for attr in kernel_indicators) def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs): base_sched = getattr(self._base, "scheduler", None) @@ -78,14 +80,12 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] try: return wrapped_scheduler.clone_for_request( - num_inference_steps=num_inference_steps, - device=device, - **clone_kwargs + num_inference_steps=num_inference_steps, device=device, **clone_kwargs ) except Exception as e: logger.debug(f"clone_for_request failed: {e}; trying shallow copy fallback") try: - if hasattr(wrapped_scheduler, 'scheduler'): + if hasattr(wrapped_scheduler, "scheduler"): try: copied_scheduler = copy.copy(wrapped_scheduler.scheduler) return BaseAsyncScheduler(copied_scheduler) @@ -95,8 +95,10 @@ def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] copied_scheduler = copy.copy(wrapped_scheduler) return BaseAsyncScheduler(copied_scheduler) except Exception as e2: - logger.warning(f"Shallow copy of scheduler also failed: {e2}. Using original scheduler (*thread-unsafe but functional*).") - return wrapped_scheduler + logger.warning( + f"Shallow copy of scheduler also failed: {e2}. Using original scheduler (*thread-unsafe but functional*)." 
+ ) + return wrapped_scheduler def _autodetect_mutables(self, max_attrs: int = 40): if not self._auto_detect_mutables: @@ -107,8 +109,7 @@ def _autodetect_mutables(self, max_attrs: int = 40): candidates: List[str] = [] seen = set() - - + for name in dir(self._base): if name.startswith("__"): continue @@ -116,7 +117,7 @@ def _autodetect_mutables(self, max_attrs: int = 40): continue if name in ("to", "save_pretrained", "from_pretrained"): continue - + try: val = getattr(self._base, name) except Exception: @@ -165,7 +166,9 @@ def _clone_mutable_attrs(self, base, local): attrs_to_clone = list(self._mutable_attrs) attrs_to_clone.extend(self._autodetect_mutables()) - EXCLUDE_ATTRS = {"components",} + EXCLUDE_ATTRS = { + "components", + } for attr in attrs_to_clone: if attr in EXCLUDE_ATTRS: @@ -213,16 +216,16 @@ def _clone_mutable_attrs(self, base, local): def _is_tokenizer_component(self, component) -> bool: if component is None: return False - - tokenizer_methods = ['encode', 'decode', 'tokenize', '__call__'] + + tokenizer_methods = ["encode", "decode", "tokenize", "__call__"] has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods) - + class_name = component.__class__.__name__.lower() - has_tokenizer_in_name = 'tokenizer' in class_name - - tokenizer_attrs = ['vocab_size', 'pad_token', 'eos_token', 'bos_token'] + has_tokenizer_in_name = "tokenizer" in class_name + + tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"] has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs) - + return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs) def _should_wrap_tokenizers(self) -> bool: @@ -238,11 +241,21 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = local_pipe = copy.deepcopy(self._base) try: - if hasattr(local_pipe, "vae") and local_pipe.vae is not None and not isinstance(local_pipe.vae, ThreadSafeVAEWrapper): + if ( + hasattr(local_pipe, "vae") + and local_pipe.vae is not None + and not isinstance(local_pipe.vae, ThreadSafeVAEWrapper) + ): local_pipe.vae = ThreadSafeVAEWrapper(local_pipe.vae, self._vae_lock) - if hasattr(local_pipe, "image_processor") and local_pipe.image_processor is not None and not isinstance(local_pipe.image_processor, ThreadSafeImageProcessorWrapper): - local_pipe.image_processor = ThreadSafeImageProcessorWrapper(local_pipe.image_processor, self._image_lock) + if ( + hasattr(local_pipe, "image_processor") + and local_pipe.image_processor is not None + and not isinstance(local_pipe.image_processor, ThreadSafeImageProcessorWrapper) + ): + local_pipe.image_processor = ThreadSafeImageProcessorWrapper( + local_pipe.image_processor, self._image_lock + ) except Exception as e: logger.debug(f"Could not wrap vae/image_processor: {e}") @@ -253,7 +266,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = num_inference_steps=num_inference_steps, device=device, return_scheduler=True, - **{k: v for k, v in kwargs.items() if k in ['timesteps', 'sigmas']} + **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]}, ) final_scheduler = BaseAsyncScheduler(configured_scheduler) @@ -262,10 +275,9 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.") self._clone_mutable_attrs(self._base, local_pipe) - original_tokenizers = {} - + if self._should_wrap_tokenizers(): try: for name in dir(local_pipe): @@ 
-281,7 +293,7 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = for key, val in local_pipe.components.items(): if val is None: continue - + if self._is_tokenizer_component(val): if not isinstance(val, ThreadSafeTokenizerWrapper): original_tokenizers[f"components[{key}]"] = val @@ -293,9 +305,8 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = result = None cm = getattr(local_pipe, "model_cpu_offload_context", None) - + try: - if callable(cm): try: with cm(): @@ -316,8 +327,8 @@ def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = try: for name, tok in original_tokenizers.items(): if name.startswith("components["): - key = name[len("components["):-1] - if hasattr(local_pipe, 'components') and isinstance(local_pipe.components, dict): + key = name[len("components[") : -1] + if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict): local_pipe.components[key] = tok else: setattr(local_pipe, name, tok) diff --git a/examples/server-async/utils/wrappers.py b/examples/server-async/utils/wrappers.py index 5130f175c2b1..1e8474eabf3f 100644 --- a/examples/server-async/utils/wrappers.py +++ b/examples/server-async/utils/wrappers.py @@ -4,31 +4,38 @@ def __init__(self, tokenizer, lock): self._lock = lock self._thread_safe_methods = { - '__call__', 'encode', 'decode', 'tokenize', - 'encode_plus', 'batch_encode_plus', 'batch_decode' + "__call__", + "encode", + "decode", + "tokenize", + "encode_plus", + "batch_encode_plus", + "batch_decode", } - + def __getattr__(self, name): attr = getattr(self._tokenizer, name) - + if name in self._thread_safe_methods and callable(attr): + def wrapped_method(*args, **kwargs): with self._lock: return attr(*args, **kwargs) + return wrapped_method - + return attr def __call__(self, *args, **kwargs): with self._lock: return self._tokenizer(*args, **kwargs) - + def __setattr__(self, name, value): - if name.startswith('_'): + if name.startswith("_"): super().__setattr__(name, value) else: setattr(self._tokenizer, name, value) - + def __dir__(self): return dir(self._tokenizer) @@ -41,9 +48,11 @@ def __init__(self, vae, lock): def __getattr__(self, name): attr = getattr(self._vae, name) if name in {"decode", "encode", "forward"} and callable(attr): + def wrapped(*args, **kwargs): with self._lock: return attr(*args, **kwargs) + return wrapped return attr @@ -53,6 +62,7 @@ def __setattr__(self, name, value): else: setattr(self._vae, name, value) + class ThreadSafeImageProcessorWrapper: def __init__(self, proc, lock): self._proc = proc @@ -61,9 +71,11 @@ def __init__(self, proc, lock): def __getattr__(self, name): attr = getattr(self._proc, name) if name in {"postprocess", "preprocess"} and callable(attr): + def wrapped(*args, **kwargs): with self._lock: return attr(*args, **kwargs) + return wrapped return attr @@ -71,4 +83,4 @@ def __setattr__(self, name, value): if name.startswith("_"): super().__setattr__(name, value) else: - setattr(self._proc, name, value) \ No newline at end of file + setattr(self._proc, name, value)
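
Usage sketch for the series above: it shows how RequestScopedPipeline and the thread-safe wrappers are intended to be combined in a concurrent server, with model weights loaded once and only per-request state isolated. This is a minimal, illustrative example; the import path (utils.requestscopedpipeline), the checkpoint id, the prompt, and the worker count are assumptions, not values taken from the patches.

    from concurrent.futures import ThreadPoolExecutor

    import torch
    from diffusers import StableDiffusionPipeline

    # Assumed import path, mirroring examples/server-async/utils/requestscopedpipeline.py
    from utils.requestscopedpipeline import RequestScopedPipeline

    # Load the heavy weights once per process; UNet/VAE/text encoder stay shared.
    base = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # assumed checkpoint; any supported pipeline should work
        torch_dtype=torch.float16,
    ).to("cuda")

    # Each generate() call builds a request-local pipeline copy with its own
    # scheduler/timesteps and wraps tokenizer/VAE/image processor in the
    # thread-safe wrappers, so concurrent requests do not clobber each other.
    pipe = RequestScopedPipeline(base)

    def handle_request(prompt: str, steps: int = 30):
        result = pipe.generate(prompt=prompt, num_inference_steps=steps, device="cuda")
        return result.images[0]  # standard Stable Diffusion pipeline output field

    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(handle_request, f"a photo of a cat, variant {i}") for i in range(4)]
        images = [f.result() for f in futures]

The intent is that only per-request state (scheduler timesteps, RNG and progress-bar bookkeeping, tokenizer/VAE/image-processor calls) is isolated or serialized behind locks, while the large modules remain shared across threads.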