使用したPC
Ubuntu 24.04 Intel Arc A770
Python環境構築
IPEXを使いました。IPEXの導入方法はこちら。
python -m pip install torch==2.5.1+cxx11.abi intel-extension-for-pytorch==2.5.10+xpu oneccl_bind_pt==2.5.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
pip install diffusers[torch]
pip install transformers protobuf sentencepiece
方法
4bit、8bitへの量子化は行わずにいくつかの方法で実行しました。enable_sequential_cpu_offload(device="xpu")を使う方法
import torch
from diffusers import StableDiffusion3Pipeline
from decorator import time_monitor


def print_memory():
    """Print the peak XPU memory allocated/reserved so far, in GiB."""
    max_memory = round(torch.xpu.max_memory_allocated() / 1024**3, 2)
    max_reserved = round(torch.xpu.max_memory_reserved() / 1024**3, 2)
    print(f"{max_memory=} GB")
    print(f"{max_reserved=} GB")


@time_monitor
def main():
    """Run Stable Diffusion 3.5 Large with sequential CPU offload on the XPU.

    Sequential offload moves one submodule at a time onto the device,
    minimizing peak device memory at the cost of host<->device transfers.
    Saves the generated image and reports peak memory usage.
    """
    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stable-diffusion-3.5-large", torch_dtype=torch.bfloat16
    )
    pipe.enable_sequential_cpu_offload(device="xpu")
    seed = 20241023
    image = pipe(
        "A capybara holding a sign that reads Hello World",
        num_inference_steps=28,
        guidance_scale=3.5,
        # CPU generator seeded for reproducible sampling across runs.
        generator=torch.Generator().manual_seed(seed),
    ).images[0]
    # Plain string literal: the former f-string had no placeholders.
    image.save("enable_sequential_cpu_offload.jpg")
    print_memory()


if __name__ == "__main__":
    main()
結果
max_memory=5.95 GB max_reserved=6.2 GB time: 220.25 sec
Text Encoderを分離させる方法
Text Encoderを分離させてそこだけはGPUで計算させました。しかし、それほど早くはなりませんでした。
import gc

import torch
from diffusers import StableDiffusion3Pipeline
from decorator import time_monitor


def reset_memory():
    """Free cached XPU memory and reset the peak-memory counters."""
    gc.collect()
    torch.xpu.empty_cache()
    torch.xpu.reset_accumulated_memory_stats()
    torch.xpu.reset_peak_memory_stats()


def print_memory():
    """Print the peak XPU memory allocated/reserved so far, in GiB."""
    max_memory = round(torch.xpu.max_memory_allocated() / 1024**3, 2)
    max_reserved = round(torch.xpu.max_memory_reserved() / 1024**3, 2)
    print(f"{max_memory=} GB")
    print(f"{max_reserved=} GB")


@time_monitor
def main():
    """Run SD 3.5 Large in two phases to lower peak XPU memory.

    Phase 1 loads only the text encoders on the XPU and computes the
    prompt embeddings; phase 2 reloads the pipeline with only the
    transformer and VAE under sequential CPU offload and denoises from
    the precomputed embeddings. Peak memory is reported per phase.
    """
    # Phase 1: text encoders only (transformer/VAE excluded), fully on XPU.
    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stable-diffusion-3.5-large",
        transformer=None,
        vae=None,
        torch_dtype=torch.bfloat16,
    ).to("xpu")
    with torch.no_grad():
        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = pipe.encode_prompt(
            prompt="A capybara holding a sign that reads Hello World",
            prompt_2=None,
            prompt_3=None,
        )
    print("text_encoder: ")
    print_memory()

    # Release the text encoders before loading the heavy diffusion weights.
    del pipe
    reset_memory()

    # Phase 2: transformer + VAE only, offloaded sequentially to the XPU.
    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stable-diffusion-3.5-large",
        text_encoder=None,
        text_encoder_2=None,
        text_encoder_3=None,
        tokenizer=None,
        tokenizer_2=None,
        tokenizer_3=None,
        torch_dtype=torch.bfloat16,
    )
    pipe.enable_sequential_cpu_offload(device="xpu")
    seed = 20241023
    image = pipe(
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        num_inference_steps=28,
        guidance_scale=3.5,
        # CPU generator seeded for reproducible sampling across runs.
        generator=torch.Generator().manual_seed(seed),
    ).images[0]
    # Plain string literal: the former f-string had no placeholders.
    image.save("separate.jpg")
    print("transformer and vae: ")
    print_memory()


if __name__ == "__main__":
    main()
text_encoder: max_memory=10.47 GB max_reserved=10.6 GB transformer and vae: max_memory=5.94 GB max_reserved=6.16 GB time: 211.01 sec
うまく行かなかった方法
- すべてGPUにのせた場合
- enable_model_cpu_offload(device="xpu")を使った場合
- device_map="balanced"を指定した場合