import os

import torch
from diffusers import AutoencoderKL, UniPCMultistepScheduler, UNet2DConditionModel
from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
# Load each component of Stable Diffusion v1.4 separately: the VAE for moving
# between pixel and latent space, the CLIP tokenizer/text encoder for the prompt,
# the UNet that predicts noise, and the scheduler that drives the denoising loop.
vae = AutoencoderKL.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True
)
tokenizer = CLIPTokenizer.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="tokenizer"
)
text_encoder = CLIPTextModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
)
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
)
scheduler = UniPCMultistepScheduler.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="scheduler"
)
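# Optional sketch: the weights can also be loaded in half precision to roughly
# halve GPU memory use. This is an assumption about your setup, not something the
# rest of the script requires; if you do this, every model must use the same dtype:
#
#   vae = AutoencoderKL.from_pretrained(
#       "CompVis/stable-diffusion-v1-4", subfolder="vae",
#       torch_dtype=torch.float16, use_safetensors=True,
#   )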
# Move the models to the GPU, falling back to CPU if CUDA is unavailable.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
vae.to(torch_device)
text_encoder.to(torch_device)
unet.to(torch_device)
prompt = [
    "1girl, aqua eyes, baseball cap, blonde hair, closed mouth, earrings, "
    "green background, hat, hoop earrings, jewelry, looking at viewer, shirt, "
    "short hair, simple background, solo, upper body, yellow shirt, "
    "high quality, masterpiece"
]
height = 512                # output image height in pixels
width = 512                 # output image width in pixels
num_inference_steps = 25    # number of denoising steps
guidance_scale = 7.5        # classifier-free guidance strength
generator = torch.manual_seed(0)  # fixed seed so the initial latents are reproducible
batch_size = len(prompt)
# Tokenize the prompt, padding/truncating to the text encoder's maximum length.
text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
# Embed an empty prompt for the unconditional branch of classifier-free guidance.
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer(
    [""] * batch_size,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
)
with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
# Concatenate both embeddings so the conditional and unconditional branches can
# share a single UNet forward pass per step.
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
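# Sketch of a common variation (not used below): a negative prompt simply replaces
# the empty string in uncond_input, steering guidance away from the listed concepts.
# The prompt text here is an arbitrary example:
#
#   uncond_input = tokenizer(
#       ["lowres, bad anatomy, blurry"] * batch_size,
#       padding="max_length", max_length=max_length, return_tensors="pt",
#   )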
# Start from random Gaussian latents; the VAE downsamples images by a factor of 8.
latents = torch.randn(
    (batch_size, unet.config.in_channels, height // 8, width // 8),
    generator=generator,
)
latents = latents.to(torch_device)
# Scale the initial noise to the standard deviation the scheduler expects.
latents = latents * scheduler.init_noise_sigma
scheduler.set_timesteps(num_inference_steps)
for t in tqdm(scheduler.timesteps):
    # Duplicate the latents so both guidance branches go through the UNet together.
    latent_model_input = torch.cat([latents] * 2)
    latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
    # Predict the noise residual for both branches.
    with torch.no_grad():
        noise_pred = unet(
            latent_model_input, t, encoder_hidden_states=text_embeddings
        ).sample
    # Classifier-free guidance: push the prediction toward the text-conditioned one.
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    # Step the scheduler: compute the less-noisy latents for the next iteration.
    latents = scheduler.step(noise_pred, t, latents).prev_sample
# Rescale the latents (0.18215 is the SD v1 VAE scaling factor) and decode to pixels.
latents = 1 / 0.18215 * latents
with torch.no_grad():
    image = vae.decode(latents).sample
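# Equivalent rescale without the hard-coded constant, assuming the factor stored in
# the VAE config matches the checkpoint (it is 0.18215 for the SD v1 checkpoints):
#
#   latents = latents / vae.config.scaling_factor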
# Map the decoded tensor from [-1, 1] to [0, 255] uint8 and convert to a PIL image.
image = (image / 2 + 0.5).clamp(0, 1).squeeze()
image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
image = Image.fromarray(image)

output_dir = "/data1/sdtest/"
output_filename = "2.1.png"
output_path = os.path.join(output_dir, output_filename)
os.makedirs(output_dir, exist_ok=True)  # make sure the target directory exists
image.save(output_path)
print("Saved generated image to:", output_path)
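# For comparison, a minimal sketch of the same generation through the high-level
# diffusers pipeline (parameter names follow the StableDiffusionPipeline API; the
# scheduler swap mirrors the manual setup above):
#
#   from diffusers import StableDiffusionPipeline
#
#   pipe = StableDiffusionPipeline.from_pretrained(
#       "CompVis/stable-diffusion-v1-4", use_safetensors=True
#   ).to(torch_device)
#   pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
#   image = pipe(
#       prompt[0],
#       height=height,
#       width=width,
#       num_inference_steps=num_inference_steps,
#       guidance_scale=guidance_scale,
#       generator=torch.manual_seed(0),
#   ).images[0]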