目录
一、什么是Stable Diffusion?
Stable Diffusion,或许对于许多读者而言已不陌生,但为了确保每位读者都能清晰理解,我还是简要概述一下这一概念,避免造成误解。
Stable Diffusion是一种基于扩散模型(Diffusion Model)的生成模型。其运作机制颇为独特,起始于一个纯粹的噪声图像,随后通过一个反向扩散过程,逐步削减噪声,直至生成一个目标图像。在模型的训练阶段,它学习的是如何将一个充满噪声的图像逐步转化为清晰的图像,直至输出符合预期。
Stable Diffusion是2022年发布的文本到图像生成模型,凭借"输入文字→输出图像"的能力火爆AI圈!✨ 它是开源文生图的里程碑,将扩散模型(Diffusion Model)与文本引导的多模态预训练相结合,让普通人也能用自然语言指挥AI创作专业级图像。🎨

Stable Diffusion的特点显著:
- 高质量图像生成:它能够生成出高分辨率且细节丰富的图像,为图像生成领域带来了质的飞跃。
- 文本到图像生成:这一模型还支持通过输入文本提示来生成相应的图像。这意味着,用户只需用简单的描述性词语,就能让模型生成出符合自己想象的图像。
- 开源与易用性:作为一个开源项目,Stable Diffusion的使用非常灵活,且兼容多种编程语言和框架,如PaddlePaddle、PyTorch、TensorFlow等,为用户提供了极大的便利。
二、核心原理
核心原理拆解:当扩散模型遇见潜在空间
🔥 扩散模型:艺术的解构与重构
- 正向扩散:将清晰图片逐步加噪成"雪花屏"(马尔可夫链过程)
- 逆向去噪:训练神经网络(U-Net)从混沌中重建秩序(预测噪声并去除)
🎨 潜在空间压缩:VAE的降维魔法
- 传统扩散模型直接在像素空间运算→算力黑洞!💸
- Stable Diffusion引入变分自编码器(VAE),将图像压缩到低维潜在空间(Latent Space),效率提升10倍+!🚀
三、代码实现
数据预处理
# 构建数据读取类
# 返回图片与标签
import paddle.vision as V
from PIL import Image
from paddle.io import Dataset, DataLoader
from tqdm import tqdm
# Data augmentation/normalization pipeline: resize slightly larger than the
# target, random-crop back down, convert to tensor, and map pixels to [-1, 1]
# (the range the diffusion model is trained on).
transforms = V.transforms.Compose([
V.transforms.Resize(80), # 80 = image_size + 1/4 * image_size (64 + 16)
V.transforms.RandomResizedCrop(64, scale=(0.8, 1.0)),
V.transforms.ToTensor(),
V.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), # [0,1] -> [-1,1] per channel
])
class TrainData(Dataset):
    """Dataset of (image, label) pairs listed in a text file.

    Each non-empty line of ``txt_path`` is expected to look like
    ``path/to/image.jpg 3`` — an image path followed by an integer label.
    Images are loaded with PIL and passed through the module-level
    ``transforms`` pipeline.
    """

    def __init__(self, txt_path="data.txt"):
        with open(txt_path, "r") as f:
            # Keep only non-empty lines instead of blindly dropping the last
            # one — robust whether or not the file ends with a trailing newline.
            self.image_paths = [line for line in f if line.strip()]

    def __getitem__(self, index):
        # Split once from the right so image paths containing spaces still
        # parse: the label is always the last whitespace-separated token.
        image_path, label = self.image_paths[index].strip().rsplit(" ", 1)
        # Force 3-channel RGB; grayscale/RGBA files would otherwise break the
        # 3-channel Normalize transform.
        image = Image.open(image_path).convert("RGB")
        image = transforms(image)
        return image, int(label)

    def __len__(self):
        return len(self.image_paths)
# Instantiate the dataset and a shuffled training loader (batch size 64).
dataset = TrainData()
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
模型构建
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class EMA:
    """Exponential moving average (EMA) of model weights.

    Maintains a shadow copy of the model whose parameters track the live
    model as ``ema = beta * ema + (1 - beta) * current`` after each
    training step, which typically yields smoother samples at eval time.
    """

    def __init__(self, beta):
        # beta: smoothing factor (e.g. 0.995); higher = slower tracking.
        self.beta = beta
        self.step = 0

    def update_model_average(self, ma_model, current_model):
        """Blend ``current_model``'s parameters into ``ma_model`` in place."""
        for current_params, ma_params in zip(current_model.parameters(), ma_model.parameters()):
            old_weight, up_weight = ma_params, current_params
            # Write the blended value back into the EMA parameter. The
            # original rebound the local name `ma_params`, which updates
            # nothing — set_value mutates the tensor itself.
            ma_params.set_value(self.update_average(old_weight, up_weight))

    def update_average(self, old, new):
        """Return the EMA-updated value; pass `new` through on first use."""
        if old is None:
            return new
        return old * self.beta + (1 - self.beta) * new

    def step_ema(self, ema_model, model, step_start_ema=1000):
        """Advance one EMA step: warm up by hard-copying, then blend.

        NOTE(review): the source listing was truncated at ``if self.step``;
        this completion follows the standard DDPM EMA recipe (copy weights
        until `step_start_ema`, then moving-average) — confirm against the
        original repository.
        """
        if self.step < step_start_ema:
            self.reset_parameters(ema_model, model)
        else:
            self.update_model_average(ema_model, model)
        self.step += 1

    def reset_parameters(self, ema_model, model):
        """Hard-copy the live model's weights into the EMA model."""
        ema_model.set_state_dict(model.state_dict())
构建 Stable Diffusion 部分
import paddle
from tqdm import tqdm
from tools import *
class Diffusion:
# DDPM-style diffusion helper: builds the noise schedule, applies the forward
# noising process, and runs the reverse (denoising) sampling loop with
# classifier-free guidance.
# NOTE(review): the blog extraction stripped Python indentation from this
# listing; statements are kept byte-identical and must be re-indented to run.
def __init__(self, noise_steps=500, beta_start=1e-4, beta_end=0.02, img_size=256, device="cuda"):
# noise_steps: number of diffusion timesteps T.
# beta_start / beta_end: endpoints of the linear beta schedule.
# img_size: height/width of the square images generated by sample().
# device: stored but not used explicitly below — presumably paddle's
# global device placement is relied on; TODO confirm intended use.
self.noise_steps = noise_steps
self.beta_start = beta_start
self.beta_end = beta_end
self.beta = self.prepare_noise_schedule()
self.alpha = 1. - self.beta  # alpha_t = 1 - beta_t
self.alpha_hat = paddle.cumprod(self.alpha, dim=0)  # alpha_hat_t = prod(alpha_1..alpha_t)
self.img_size = img_size
self.device = device
def prepare_noise_schedule(self):
# Linear beta schedule beta_1..beta_T.
return paddle.linspace(self.beta_start, self.beta_end, self.noise_steps)
def noise_images(self, x, t):
# Forward process q(x_t | x_0): x_t = sqrt(alpha_hat)*x_0 + sqrt(1-alpha_hat)*eps.
# Returns the noised batch and the noise eps (the training target).
sqrt_alpha_hat = paddle.sqrt(self.alpha_hat[t])[:, None, None, None]
sqrt_one_minus_alpha_hat = paddle.sqrt(1 - self.alpha_hat[t])[:, None, None, None]
Ɛ = paddle.randn(shape=x.shape)
return sqrt_alpha_hat * x + sqrt_one_minus_alpha_hat * Ɛ, Ɛ
def sample_timesteps(self, n):
# Uniform random timesteps in [1, noise_steps) for a batch of size n.
return paddle.randint(low=1, high=self.noise_steps, shape=(n,))
def sample(self, model, n, labels, cfg_scale=3):
# Generate n images by iterating the reverse process from pure noise.
# labels: class conditioning tensor; cfg_scale > 0 enables classifier-free
# guidance (an extra unconditional forward pass blended in via lerp).
model.eval()
with paddle.no_grad():
x = paddle.randn((n, 3, self.img_size, self.img_size))
for i in tqdm(reversed(range(1, self.noise_steps)), position=0, desc=f"denoising, wait for {self.noise_steps} steps ", unit=' steps',ncols=80):
t = paddle.to_tensor([i] * x.shape[0]).astype("int64")
predicted_noise = model(x, t, labels)
if cfg_scale > 0:
uncond_predicted_noise = model(x, t, None)
# NOTE(review): rebinding cfg_scale to a tensor each iteration is
# redundant after the first pass but harmless.
cfg_scale = paddle.to_tensor(cfg_scale).astype("float32")
# lerp(uncond, cond, s) = uncond + s * (cond - uncond)
predicted_noise = paddle.lerp(uncond_predicted_noise, predicted_noise, cfg_scale)
alpha = self.alpha[t][:, None, None, None]
alpha_hat = self.alpha_hat[t][:, None, None, None]
beta = self.beta[t][:, None, None, None]
if i > 1:
noise = paddle.randn(shape=x.shape)
else:
# Final step: output the posterior mean without added noise.
noise = paddle.zeros_like(x)
# DDPM reverse update (Ho et al. 2020, Algorithm 2).
x = 1 / paddle.sqrt(alpha) * (x - ((1 - alpha) / (paddle.sqrt(1 - alpha_hat))) * predicted_noise) + paddle.sqrt(beta) * noise
model.train()
# Rescale from [-1, 1] back to [0, 255]; still float — caller casts to uint8.
x = (x.clip(-1, 1) + 1) / 2
x = (x * 255)
return x
模型训练
import paddle
import paddle.nn as nn
import paddle.optimizer as optimizer
import paddle.vision.transforms as transforms
from paddle.io import DataLoader
from models import *
from dataset import TrainData
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
from tools import *
# Training hyper-parameters.
epochs = 1000
batch_size = 16
num_classes = 6  # six scene categories (see the evaluation section)
lr = 1.5e-4
image_size = 64
# Resume from a saved checkpoint when True; otherwise train from scratch.
load_checkpoints = True
load_checkpoints_path = "checkpoints/sd_unet_330.pdparams"
dataset = TrainData()
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
def train():
    """Train the conditional UNet noise predictor with EMA tracking.

    Optionally resumes from ``load_checkpoints_path``; each step noises a
    batch at random timesteps and regresses the injected noise with MSE.

    NOTE(review): the source listing was truncated at ``if np.random.random()``;
    the body of the inner loop from that point on reconstructs the standard
    classifier-free-guidance training recipe — confirm against the original
    repository before relying on it.
    """
    device = "cuda" if paddle.device.is_compiled_with_cuda() else "cpu"
    start_epoch = 0
    if load_checkpoints:
        start_epoch = get_checkponit_epoch(load_checkpoints_path)
        model = UNet_conditional(num_classes=num_classes, device=device)
        my_logger.info(f"Loading model from {load_checkpoints_path}")
        params = paddle.load(load_checkpoints_path)
        model.set_state_dict(params)
        my_logger.info(f"Model loaded successfully!")
    else:
        my_logger.info("Training from scratch")
        model = UNet_conditional(num_classes=num_classes, device=device)
    opt = optimizer.Adam(learning_rate=lr, parameters=model.parameters())
    mse = nn.MSELoss()
    diffusion = Diffusion(img_size=image_size, device=device)
    l = len(dataloader)
    ema = EMA(0.995)
    # EMA model starts as an exact copy and is blended in after each step.
    ema_model = copy.deepcopy(model)
    ema_model.eval()
    for epoch in range(start_epoch, epochs):
        cache_loss = 0
        pbar = tqdm(dataloader, desc=f"[{my_time.get_time()}] Epoch {epoch}", position=0, leave=True)
        for i, (images, labels) in enumerate(pbar):
            B = images.shape[0]  # [B, C, H, W]
            t = diffusion.sample_timesteps(B)
            x_t, noise = diffusion.noise_images(images, t)
            # Classifier-free guidance training: drop the labels ~10% of the
            # time so the model also learns the unconditional distribution.
            if np.random.random() < 0.1:
                labels = None
            predicted_noise = model(x_t, t, labels)
            loss = mse(noise, predicted_noise)
            opt.clear_grad()
            loss.backward()
            opt.step()
            ema.step_ema(ema_model, model)
            cache_loss += float(loss)
            pbar.set_postfix(MSE=float(loss))
        my_logger.info(f"Epoch {epoch} mean loss: {cache_loss / l}")
        # Periodically checkpoint the weights (path matches the ones loaded
        # above, e.g. checkpoints/sd_unet_330.pdparams).
        if epoch % 10 == 0:
            paddle.save(model.state_dict(), f"checkpoints/sd_unet_{epoch}.pdparams")
模型评估
import paddle
from models import *
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
# Evaluation: sample one image per class at several guidance strengths and
# show them in a grid (rows = cfg_scale values, columns = classes).
# NOTE(review): the blog extraction stripped Python indentation from the
# loops below; statements are kept byte-identical and must be re-indented.
# NOTE(review): the titles are Chinese strings — matplotlib needs a CJK font
# configured to render them; confirm the runtime environment provides one.
plt.rcParams['figure.figsize'] = (10, 5)
plt.rcParams['font.size'] = 15
plt.rcParams['axes.unicode_minus'] = False
model = UNet_conditional(num_classes=6)
model.set_state_dict(paddle.load("checkpoints/sd_unet_490.pdparams")) # load trained weights
diffusion = Diffusion(img_size=64, device="cuda")
# Labels 0..5 correspond to:
# buildings, forest, glacier, mountain, sea, street
name = ["建筑物", "森林", "冰川", "山峰", "大海", "街道"]
labels = paddle.to_tensor([0, 1, 2, 3, 4, 5]).astype("int64")
# Classifier-free guidance strengths to compare (one grid row each).
cfg_scale = [3,7,10]
# Upscale the generated 64x64 samples to 256x256 for display.
fixed_size = (256, 256)
plt.figure(figsize=(15, 10))
for i in range(len(cfg_scale)):
sampled_images = diffusion.sample(model, n=len(labels), labels=labels, cfg_scale=cfg_scale[i])
for j in range(6):
img = sampled_images[j].transpose([1, 2, 0]) # CHW -> HWC for imshow
img = np.array(img).astype("uint8")
img = Image.fromarray(img).resize(fixed_size) # resize for display
plt.subplot(len(cfg_scale), 6, i * 6 + j + 1)
plt.imshow(img)
plt.title(name[labels[j]])
plt.axis('off')
plt.tight_layout()
plt.show()
文章来源于互联网:用AI画出你的想象力:基于PaddlePaddle的多模态Stable Diffusion(附代码)
作为一位热衷于文学创作的青年作者,我怀揣着无限的激情与好奇,荣幸地参与了此次神秘而刺激的写作征程。在这样独特的经历中,我深深地感知到了人工智能所蕴含的强大力量。 第一站:探索未知领域 在此次行程中,我体验到了新奇独特的领域——AI文章生成工具“夸克”。这是一款…
5bei.cn大模型教程网










