はじめに
前回の続きです。touch-sp.hatenablog.com
作成時間を短縮するために工程を分割してみました。
分割するとCPU offloadをしなくて済むのではないかという発想です。
Pythonスクリプト
import torch
from diffusers import ConsisIDPipeline
from diffusers.pipelines.consisid.consisid_utils import (
    prepare_face_models,
    process_face_embeddings_infer,
)
from diffusers.utils import export_to_video

from decorator import gpu_monitor, time_monitor, print_memory, reset_memory

# model was downloaded from https://huggingface.co/BestWishYsh/ConsisID-preview


def _encode_face():
    """Stage 1: extract face embeddings from the reference image.

    Loads the face-analysis models, runs them once, then deletes them and
    resets GPU memory so the next stage starts from a clean slate.

    Returns:
        (id_cond, id_vit_hidden, image, face_kps) as produced by
        ``process_face_embeddings_infer``.
    """
    image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_input.png?download=true"
    (
        face_helper_1,
        face_helper_2,
        face_clip_model,
        face_main_model,
        eva_transform_mean,
        eva_transform_std,
    ) = prepare_face_models("ConsisID-preview", device="cuda", dtype=torch.bfloat16)
    id_cond, id_vit_hidden, image, face_kps = process_face_embeddings_infer(
        face_helper_1,
        face_clip_model,
        face_helper_2,
        eva_transform_mean,
        eva_transform_std,
        face_main_model,
        "cuda",
        torch.bfloat16,
        image,
        is_align_face=True,
    )
    print("Image Encoder: ")
    print_memory()
    # Drop every face model before reset_memory() so their VRAM can actually
    # be reclaimed for the following stages.
    del face_helper_1
    del face_helper_2
    del face_clip_model
    del face_main_model
    del eva_transform_mean
    del eva_transform_std
    reset_memory()
    return id_cond, id_vit_hidden, image, face_kps


def _encode_prompt():
    """Stage 2: encode the text prompt.

    Loads the pipeline with ``transformer=None, vae=None`` so only the text
    encoder occupies GPU memory; the pipeline is deleted afterwards.

    Returns:
        (prompt_embeds, negative_prompt_embeds)
    """
    prompt = "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel."
    pipe = ConsisIDPipeline.from_pretrained(
        "ConsisID-preview",
        transformer=None,
        vae=None,
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    with torch.no_grad():
        prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt=prompt)
    print("Text Encoder: ")
    print_memory()
    del pipe
    reset_memory()
    return prompt_embeds, negative_prompt_embeds


def _generate_video(image, prompt_embeds, negative_prompt_embeds, id_cond, id_vit_hidden, face_kps):
    """Stage 3: run the transformer + VAE and export the result to output.mp4.

    Loads the pipeline with ``text_encoder=None, tokenizer=None`` since the
    prompt was already encoded in stage 2; VAE tiling keeps decoding memory low.
    """
    pipe = ConsisIDPipeline.from_pretrained(
        "ConsisID-preview",
        text_encoder=None,
        tokenizer=None,
        torch_dtype=torch.bfloat16,
    )
    pipe.to("cuda")
    pipe.vae.enable_tiling()
    video = pipe(
        image=image,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        num_inference_steps=50,
        guidance_scale=6.0,
        use_dynamic_cfg=False,
        id_vit_hidden=id_vit_hidden,
        id_cond=id_cond,
        kps_cond=face_kps,
        generator=torch.Generator("cuda").manual_seed(42),  # fixed seed for reproducibility
    )
    export_to_video(video.frames[0], "output.mp4", fps=8)
    print("Transformer and VAE: ")
    print_memory()


@gpu_monitor(interval=0.5)
@time_monitor
def main():
    """Run ConsisID generation in three sequential stages.

    Each stage loads only the models it needs and frees them before the next
    one starts, so peak GPU memory stays low without CPU offload.
    """
    id_cond, id_vit_hidden, image, face_kps = _encode_face()
    prompt_embeds, negative_prompt_embeds = _encode_prompt()
    _generate_video(image, prompt_embeds, negative_prompt_embeds, id_cond, id_vit_hidden, face_kps)


if __name__ == "__main__":
    main()
結果
あまり変わりませんでした。わずかに速くなったかもしれませんし、誤差かもしれません。
Image Encoder: max_memory=1.38 GB max_reserved=1.41 GB Text Encoder: max_memory=10.13 GB max_reserved=10.17 GB Transformer and VAE: max_memory=17.04 GB max_reserved=19.47 GB time: 308.92 sec GPU 0 - Used memory: 20.94/23.99 GB