Tutorial 6: Neural Networks
What Is a Neural Network?
A neural network is a computational model inspired by biological nervous systems, built from many interconnected "neurons".
Biological neuron                          Artificial neuron

  Dendrites (receive signals)                Inputs x₁, x₂, x₃
            │                                        │
            ▼                                        ▼
  Cell body (processes signals)              Weighted sum Σ + activation f(·)
            │                                        │
            ▼                                        ▼
  Axon (transmits signals)                   Output  y = f(w₁x₁ + w₂x₂ + w₃x₃ + b)
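Before looking at full networks, it helps to compute one artificial neuron by hand. The sketch below uses made-up weights, bias, and inputs, and a sigmoid as an example activation, purely to illustrate the formula above:

import torch

# A single artificial neuron: weighted sum of the inputs plus a bias, then an activation.
# The numbers below are made-up example values, not learned parameters.
x = torch.tensor([0.5, -1.0, 2.0])   # inputs x1, x2, x3
w = torch.tensor([0.8, 0.2, -0.5])   # weights w1, w2, w3
b = torch.tensor(0.1)                # bias

z = torch.dot(w, x) + b              # weighted sum: w1*x1 + w2*x2 + w3*x3 + b
y = torch.sigmoid(z)                 # activation f(z); sigmoid chosen as an example
print(f"z = {z.item():.4f}, y = {y.item():.4f}")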
The Perceptron: The Simplest Neural Network
import torch

class Perceptron:
    """A single-layer perceptron."""

    def __init__(self, input_size):
        # Randomly initialize the weights and bias
        self.weights = torch.randn(input_size)
        self.bias = torch.randn(1)

    def forward(self, x):
        """Forward pass."""
        # Weighted sum
        z = torch.dot(x, self.weights) + self.bias
        # Step activation function
        return 1 if z > 0 else 0

    def train(self, X, y, epochs=100, lr=0.1):
        """Train with the perceptron learning rule."""
        for epoch in range(epochs):
            errors = 0
            for xi, yi in zip(X, y):
                prediction = self.forward(xi)
                error = yi - prediction
                if error != 0:
                    # Update the weights
                    self.weights += lr * error * xi
                    self.bias += lr * error
                    errors += 1
            if errors == 0:
                print(f"Epoch {epoch}: converged!")
                break

# Example: learning the AND gate
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([0, 0, 0, 1])  # AND gate outputs

perceptron = Perceptron(2)
perceptron.train(X, y)

print("\nAND gate test:")
for xi in X:
    print(f"  {xi.tolist()} -> {perceptron.forward(xi)}")
Limitations of the Perceptron
A perceptron can only solve linearly separable problems; it cannot solve the XOR problem:
AND (linearly separable)                XOR (not linearly separable)

x₂ 1 │ 0        1                       x₂ 1 │ 1        0
     │       ╱                               │   no single straight line
   0 │ 0  ╱     0                          0 │ 0        1
     └──────────────── x₁                    └──────────────── x₁
       0        1                              0        1

(The digits mark the gate's output at each input corner: a single straight line separates AND's one positive point from its zeros, but no single line can separate XOR's ones from its zeros.)
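We can make this concrete by reusing the Perceptron class from above on the XOR truth table: no matter how long it trains, at least one of the four points stays misclassified. A small sketch (the exact final predictions depend on the random initialization):

# XOR truth table: no single line separates the 1s from the 0s,
# so the single-layer perceptron above cannot fit it.
X_xor = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y_xor = torch.tensor([0, 1, 1, 0])

p = Perceptron(2)
p.train(X_xor, y_xor, epochs=100)   # never prints the "converged" message

print("XOR test (single-layer perceptron):")
for xi, yi in zip(X_xor, y_xor):
    print(f"  {xi.tolist()} -> predicted {p.forward(xi)}, expected {yi.item()}")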
The Multilayer Perceptron (MLP)
By adding hidden layers, a neural network can learn nonlinear functions.
import torch
import torch.nn as nn

class MLP(nn.Module):
    """A multilayer perceptron."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.layer1(x)
        x = self.activation(x)
        x = self.layer2(x)
        return x

# Solving the XOR problem
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)  # XOR outputs

model = MLP(2, 4, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

for epoch in range(1000):
    outputs = model(X)
    loss = criterion(outputs, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

print("\nXOR test:")
with torch.no_grad():
    predictions = model(X)
    for xi, pred in zip(X, predictions):
        print(f"  {xi.tolist()} -> {pred.item():.2f}")
Activation Functions
Activation functions introduce nonlinearity, which is what lets a neural network learn complex functions.
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Common activation functions
activations = {
    'Sigmoid': nn.Sigmoid(),
    'Tanh': nn.Tanh(),
    'ReLU': nn.ReLU(),
    'LeakyReLU': nn.LeakyReLU(0.1),
    'GELU': nn.GELU()
}

x = torch.linspace(-5, 5, 100)
fig, axes = plt.subplots(1, 5, figsize=(15, 3))

for ax, (name, func) in zip(axes, activations.items()):
    y = func(x)
    ax.plot(x.numpy(), y.numpy())
    ax.set_title(name)
    ax.grid(True)
    ax.axhline(y=0, color='k', linewidth=0.5)
    ax.axvline(x=0, color='k', linewidth=0.5)

plt.tight_layout()
plt.savefig('activations.png')
Comparing Activation Functions

| Activation | Formula | Pros | Cons |
|---|---|---|---|
| Sigmoid | 1/(1+e^(-x)) | Output in (0, 1) | Vanishing gradients |
| Tanh | (e^x-e^(-x))/(e^x+e^(-x)) | Zero-centered output | Vanishing gradients |
| ReLU | max(0, x) | Cheap to compute; mitigates vanishing gradients | Dying ReLU |
| LeakyReLU | max(αx, x) | Fixes dying ReLU | α needs tuning |
| GELU | x·Φ(x) | Smooth; widely used in Transformers | Slightly more expensive |
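The formulas in the table can be checked numerically against the corresponding nn modules. This is only a sanity-check sketch: the test points are arbitrary, and nn.GELU() in its default (exact) form corresponds to x·Φ(x) with Φ the standard normal CDF.

# Quick numeric check of the formulas in the table against the nn modules.
x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])   # arbitrary test points

sigmoid_manual = 1 / (1 + torch.exp(-x))
tanh_manual = (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
relu_manual = torch.clamp(x, min=0)                     # max(0, x)
gelu_manual = x * 0.5 * (1 + torch.erf(x / 2 ** 0.5))   # x * Φ(x), exact GELU

print(torch.allclose(sigmoid_manual, nn.Sigmoid()(x)))  # should print True
print(torch.allclose(tanh_manual, nn.Tanh()(x)))        # should print True
print(torch.allclose(relu_manual, nn.ReLU()(x)))        # should print True
print(torch.allclose(gelu_manual, nn.GELU()(x)))        # should print True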
Backpropagation
Backpropagation is the core algorithm for training neural networks: it applies the chain rule to compute gradients.
Forward pass:
    input x ──► hidden layer ──► output y ──► loss L
Backward pass:
    ∂L/∂w ◄── ∂L/∂y ◄── ∂L/∂L = 1
import torch

# Implementing backpropagation by hand (to understand the mechanics)
class ManualMLP:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize the weights; autograd is not used, gradients are computed by hand
        self.W1 = torch.randn(input_size, hidden_size, requires_grad=False) * 0.1
        self.b1 = torch.zeros(hidden_size, requires_grad=False)
        self.W2 = torch.randn(hidden_size, output_size, requires_grad=False) * 0.1
        self.b2 = torch.zeros(output_size, requires_grad=False)

    def forward(self, x):
        """Forward pass."""
        # First layer
        self.z1 = x @ self.W1 + self.b1
        self.a1 = torch.relu(self.z1)
        # Second layer
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.z2  # linear output
        return self.a2

    def backward(self, x, y, y_pred, lr=0.01):
        """Backward pass and parameter update."""
        batch_size = x.shape[0]
        # Output-layer gradients
        dL_dz2 = (y_pred - y) / batch_size
        dL_dW2 = self.a1.T @ dL_dz2
        dL_db2 = dL_dz2.sum(dim=0)
        # Hidden-layer gradients
        dL_da1 = dL_dz2 @ self.W2.T
        dL_dz1 = dL_da1 * (self.z1 > 0).float()  # derivative of ReLU
        dL_dW1 = x.T @ dL_dz1
        dL_db1 = dL_dz1.sum(dim=0)
        # Update the parameters
        self.W2 -= lr * dL_dW2
        self.b2 -= lr * dL_db2
        self.W1 -= lr * dL_dW1
        self.b1 -= lr * dL_db1
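The class above is never actually run, so here is a small sketch that trains it on the XOR data with a squared-error loss. The hidden size, learning rate, and epoch count are arbitrary choices, and convergence depends on the random initialization.

# Train the manual network on XOR (illustrative sketch only).
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0.], [1.], [1.], [0.]])

net = ManualMLP(2, 8, 1)
for epoch in range(2000):
    y_pred = net.forward(X)
    loss = ((y_pred - y) ** 2).mean()   # logged for monitoring only
    net.backward(X, y, y_pred, lr=0.1)
    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

print(net.forward(X))  # outputs should move toward [0, 1, 1, 0], depending on the seed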
# Using PyTorch autograd (what you do in practice)
import torch.nn as nn

class AutogradMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# PyTorch computes the gradients automatically
model = AutogradMLP(2, 4, 1)
x = torch.randn(10, 2)
y = torch.randn(10, 1)

# Forward pass
y_pred = model(x)
loss = ((y_pred - y) ** 2).mean()

# Backward pass (automatic)
loss.backward()

# Inspect the gradients
print("fc1 weight gradient shape:", model.fc1.weight.grad.shape)
print("fc2 weight gradient shape:", model.fc2.weight.grad.shape)
Hands-On: Handwritten Digit Recognition
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# 1. Prepare the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000)

# 2. Define the network
class DigitClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(28 * 28, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)

model = DigitClassifier()

# 3. Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Training loop
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for data, target in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()
        total += target.size(0)
    return total_loss / len(loader), correct / total

def test(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()
            total += target.size(0)
    return correct / total

# 5. Train
print("Training the MNIST classifier...")
for epoch in range(10):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    test_acc = test(model, test_loader)
    print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, "
          f"Train Acc={train_acc:.2%}, Test Acc={test_acc:.2%}")

# 6. Save the model
torch.save(model.state_dict(), 'mnist_classifier.pt')
print("\nModel saved!")
Designing Network Architectures
# Common architectural patterns
import torch
import torch.nn as nn

# 1. Progressively shrinking widths (common for classification)
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 10)
        )

    def forward(self, x):
        return self.layers(x)

# 2. Progressively expanding widths (common for generation)
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(10, 64),
            nn.ReLU(),
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, 784)
        )

    def forward(self, x):
        return self.layers(x)

# 3. Residual connections (for deep networks)
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )

    def forward(self, x):
        return x + self.layers(x)  # residual (skip) connection

# 4. Batch normalization (speeds up training)
class NormalizedMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        return self.layers(x)
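These patterns compose. For example, residual blocks can be stacked behind an input projection to form a deeper MNIST-style classifier; the depth and widths below are arbitrary choices for illustration.

# Composing the patterns: an input projection followed by stacked residual blocks.
class DeepResidualMLP(nn.Module):
    def __init__(self, num_blocks=4):
        super().__init__()
        self.input_proj = nn.Linear(784, 256)
        self.blocks = nn.Sequential(*[ResidualBlock(256) for _ in range(num_blocks)])
        self.head = nn.Linear(256, 10)

    def forward(self, x):
        x = torch.relu(self.input_proj(x))
        x = self.blocks(x)
        return self.head(x)

deep_model = DeepResidualMLP()
print(deep_model(torch.randn(8, 784)).shape)  # expected: torch.Size([8, 10])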
Key Concepts at a Glance

| Concept | Meaning |
|---|---|
| Neuron | Basic unit that takes inputs, computes a weighted sum, and applies an activation |
| Layer | A collection of neurons |
| Activation function | Function that introduces nonlinearity |
| Forward pass | The computation from inputs to outputs |
| Backpropagation | The algorithm for computing gradients |
| Gradient descent | Updating parameters in the direction opposite to the gradient |
| Loss function | A measure of prediction error |
Next Steps
In the next tutorial, we will learn how to build deep learning models with PyTorch.