Tutorial 6: Neural Networks

What Is a Neural Network?

A neural network is a computational model inspired by biological nervous systems. It is built from a large number of interconnected "neurons".

Biological neuron                Artificial neuron

┌────────────────────┐          ┌─────────────┐
│  Dendrites         │          │  Input x₁   │──┐
│  (receive signals) │          │  Input x₂   │──┼──► Σ ──► f(·) ──► output
└─────────┬──────────┘          │  Input x₃   │──┘
          │                     └─────────────┘
          ▼
┌────────────────────┐          Weighted sum + activation function
│  Cell body         │
│  (process signals) │          y = f(w₁x₁ + w₂x₂ + w₃x₃ + b)
└─────────┬──────────┘
          │
          ▼
┌────────────────────┐
│  Axon              │
│  (transmit signals)│
└────────────────────┘
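To make the formula concrete, here is a minimal sketch of a single artificial neuron in PyTorch (the inputs, weights, and bias below are made-up numbers, and sigmoid stands in for f(·)):

import torch

# One artificial neuron: y = f(w₁x₁ + w₂x₂ + w₃x₃ + b)
x = torch.tensor([1.0, 2.0, 3.0])     # inputs x₁, x₂, x₃
w = torch.tensor([0.5, -0.2, 0.1])    # weights w₁, w₂, w₃
b = torch.tensor(0.3)                 # bias

z = torch.dot(w, x) + b               # weighted sum
y = torch.sigmoid(z)                  # activation f(·)
print(f"z = {z.item():.2f}, y = {y.item():.2f}")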

The Perceptron: The Simplest Neural Network

import torch

class Perceptron:
    """A perceptron"""

    def __init__(self, input_size):
        # Randomly initialize weights and bias
        self.weights = torch.randn(input_size)
        self.bias = torch.randn(1)

    def forward(self, x):
        """Forward pass"""
        # Weighted sum
        z = torch.dot(x, self.weights) + self.bias
        # Step activation function
        return 1 if z > 0 else 0

    def train(self, X, y, epochs=100, lr=0.1):
        """Train with the perceptron learning rule"""
        for epoch in range(epochs):
            errors = 0
            for xi, yi in zip(X, y):
                prediction = self.forward(xi)
                error = yi - prediction

                if error != 0:
                    # Update weights
                    self.weights += lr * error * xi
                    self.bias += lr * error
                    errors += 1

            if errors == 0:
                print(f"Epoch {epoch}: converged!")
                break

# Example: learning the AND gate
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([0, 0, 0, 1])  # AND gate outputs

perceptron = Perceptron(2)
perceptron.train(X, y)

print("\nAND gate test:")
for xi in X:
    print(f"  {xi.tolist()} -> {perceptron.forward(xi)}")

Limitations of the Perceptron

A perceptron can only solve linearly separable problems; it cannot solve XOR, as the quick check after the diagram below demonstrates:

AND (linearly separable)    XOR (not linearly separable)

1 ┼─────────────      1 ┼─────────────
  │     ·  /            │  ·        ·
  │       /             │     ?????
  │      /              │
0 ┼──·──/──·──         0 ┼──·────────·──
  0     1               0     1
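
As a quick check, here is a minimal sketch reusing the Perceptron class defined above: trained on the XOR truth table, it never converges, because no single line separates the two classes.

X_xor = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y_xor = torch.tensor([0, 1, 1, 0])  # XOR outputs

p = Perceptron(2)
p.train(X_xor, y_xor, epochs=100)  # never prints "converged!"

print("\nXOR gate test (single perceptron):")
for xi in X_xor:
    print(f"  {xi.tolist()} -> {p.forward(xi)}")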

Multi-Layer Perceptron (MLP)

By adding hidden layers, a neural network can learn nonlinear functions.

import torch
import torch.nn as nn

class MLP(nn.Module):
    """多层感知机"""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.layer1(x)
        x = self.activation(x)
        x = self.layer2(x)
        return x

# Solving the XOR problem
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)  # XOR

model = MLP(2, 4, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

for epoch in range(1000):
    outputs = model(X)
    loss = criterion(outputs, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

print("\nXOR 测试:")
with torch.no_grad():
    predictions = model(X)
    for xi, pred in zip(X, predictions):
        print(f"  {xi.tolist()} -> {pred.item():.2f}")

Activation Functions

Activation functions introduce nonlinearity, which lets neural networks learn complex functions.

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Common activation functions
activations = {
    'Sigmoid': nn.Sigmoid(),
    'Tanh': nn.Tanh(),
    'ReLU': nn.ReLU(),
    'LeakyReLU': nn.LeakyReLU(0.1),
    'GELU': nn.GELU()
}

x = torch.linspace(-5, 5, 100)

fig, axes = plt.subplots(1, 5, figsize=(15, 3))

for ax, (name, func) in zip(axes, activations.items()):
    y = func(x)
    ax.plot(x.numpy(), y.numpy())
    ax.set_title(name)
    ax.grid(True)
    ax.axhline(y=0, color='k', linewidth=0.5)
    ax.axvline(x=0, color='k', linewidth=0.5)

plt.tight_layout()
plt.savefig('activations.png')

Activation Function Comparison

| Activation | Formula                   | Pros                                            | Cons                            |
|------------|---------------------------|-------------------------------------------------|---------------------------------|
| Sigmoid    | 1/(1+e^(-x))              | Output in (0, 1)                                | Vanishing gradients             |
| Tanh       | (e^x-e^(-x))/(e^x+e^(-x)) | Zero-centered                                   | Vanishing gradients             |
| ReLU       | max(0, x)                 | Cheap to compute, mitigates vanishing gradients | Dying ReLU                      |
| LeakyReLU  | max(αx, x)                | Fixes dying ReLU                                | α needs tuning                  |
| GELU       | x·Φ(x)                    | Smooth; standard in Transformers                | Slightly more costly to compute |
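
The "vanishing gradients" entries above can be seen directly with autograd. A small sketch (not part of the original tutorial): for large |x| the sigmoid's gradient is almost zero, while ReLU's gradient stays at 1 for positive inputs.

import torch

x = torch.tensor([-10.0, -1.0, 0.5, 10.0], requires_grad=True)
torch.sigmoid(x).sum().backward()
print("sigmoid gradients:", x.grad)  # nearly 0 at x = -10 and x = 10

x = torch.tensor([-10.0, -1.0, 0.5, 10.0], requires_grad=True)
torch.relu(x).sum().backward()
print("ReLU gradients:   ", x.grad)  # 0 for negative inputs, 1 for positive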

The Backpropagation Algorithm

Backpropagation is the core algorithm for training neural networks: it applies the chain rule to compute the gradient of the loss with respect to every parameter.

Forward pass:
input x ──► hidden layer ──► output y ──► loss L

Backward pass:
∂L/∂w ◄── ∂L/∂y ◄── ∂L/∂L = 1

import torch
import torch.nn as nn

# Manual backpropagation (to understand how it works)
class ManualMLP:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights with small random values
        self.W1 = torch.randn(input_size, hidden_size, requires_grad=False) * 0.1
        self.b1 = torch.zeros(hidden_size, requires_grad=False)
        self.W2 = torch.randn(hidden_size, output_size, requires_grad=False) * 0.1
        self.b2 = torch.zeros(output_size, requires_grad=False)

    def forward(self, x):
        """Forward pass"""
        # First layer
        self.z1 = x @ self.W1 + self.b1
        self.a1 = torch.relu(self.z1)

        # Second layer
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.z2  # linear output

        return self.a2

    def backward(self, x, y, y_pred, lr=0.01):
        """Backward pass (hand-derived gradients)"""
        # Assumes a squared-error loss L = ||y_pred - y||^2 / (2 * batch_size),
        # so dL/dy_pred = (y_pred - y) / batch_size.
        batch_size = x.shape[0]

        # Output-layer gradients
        dL_dz2 = (y_pred - y) / batch_size
        dL_dW2 = self.a1.T @ dL_dz2
        dL_db2 = dL_dz2.sum(dim=0)

        # Hidden-layer gradients
        dL_da1 = dL_dz2 @ self.W2.T
        dL_dz1 = dL_da1 * (self.z1 > 0).float()  # derivative of ReLU
        dL_dW1 = x.T @ dL_dz1
        dL_db1 = dL_dz1.sum(dim=0)

        # Gradient descent update of the weights
        self.W2 -= lr * dL_dW2
        self.b2 -= lr * dL_db2
        self.W1 -= lr * dL_dW1
        self.b1 -= lr * dL_db1

# Using PyTorch autograd (what you would use in practice)
class AutogradMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# PyTorch computes gradients automatically
model = AutogradMLP(2, 4, 1)
x = torch.randn(10, 2)
y = torch.randn(10, 1)

# Forward pass
y_pred = model(x)
loss = ((y_pred - y) ** 2).mean()

# Backward pass (automatic)
loss.backward()

# Inspect the gradients
print("fc1 weight gradient shape:", model.fc1.weight.grad.shape)
print("fc2 weight gradient shape:", model.fc2.weight.grad.shape)

Hands-On: Handwritten Digit Recognition

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# 1. Data preparation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000)

# 2. Define the network
class DigitClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(28 * 28, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)

model = DigitClassifier()

# 3. Training configuration
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Training loop
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for data, target in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()
        total += target.size(0)

    return total_loss / len(loader), correct / total

def test(model, loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()
            total += target.size(0)

    return correct / total

# 5. Train
print("Training the MNIST classifier...")
for epoch in range(10):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    test_acc = test(model, test_loader)
    print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, "
          f"Train Acc={train_acc:.2%}, Test Acc={test_acc:.2%}")

# 6. Save the model
torch.save(model.state_dict(), 'mnist_classifier.pt')
print("\n模型已保存!")

Network Architecture Design

# Common network architecture patterns

# 1. Progressively narrowing (common for classification)
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 10)
        )

    def forward(self, x):
        return self.layers(x)

# 2. Progressively widening (common for generation)
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(10, 64),
            nn.ReLU(),
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, 784)
        )

    def forward(self, x):
        return self.layers(x)

# 3. Residual connection (for deep networks)
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )

    def forward(self, x):
        return x + self.layers(x)  # residual connection

# 4. Batch normalization (speeds up and stabilizes training)
class NormalizedMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        return self.layers(x)
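
For example, residual blocks are typically stacked between an input projection and an output head. A small sketch using the ResidualBlock above (the layer sizes are arbitrary):

deep_model = nn.Sequential(
    nn.Linear(784, 256),
    ResidualBlock(256),
    ResidualBlock(256),
    ResidualBlock(256),
    nn.Linear(256, 10)
)

x = torch.randn(32, 784)
print(deep_model(x).shape)  # torch.Size([32, 10])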

Summary of Key Concepts

| Concept             | Explanation                                                                              |
|---------------------|------------------------------------------------------------------------------------------|
| Neuron              | The basic unit that receives inputs, computes a weighted sum, and applies an activation   |
| Layer               | A collection of neurons                                                                  |
| Activation function | A function that introduces nonlinearity                                                  |
| Forward pass        | The computation from input to output                                                     |
| Backpropagation     | The algorithm for computing gradients                                                    |
| Gradient descent    | Updating parameters in the direction opposite to the gradient                            |
| Loss function       | A function that measures prediction error                                                |
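
These concepts map one-to-one onto a few lines of PyTorch. A minimal sketch with made-up data:

import torch

w = torch.randn(3, requires_grad=True)  # parameters of a single neuron
x = torch.randn(3)                      # input
y_true = torch.tensor(1.0)              # target

y_pred = torch.sigmoid(w @ x)           # forward pass: weighted sum + activation
loss = (y_pred - y_true) ** 2           # loss function
loss.backward()                         # backpropagation: compute dL/dw
with torch.no_grad():
    w -= 0.1 * w.grad                   # gradient descent: step against the gradient
    w.grad.zero_()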

Next Steps

In the next tutorial, we will learn how to build deep learning models with PyTorch.

Tutorial 7: Hands-On Deep Learning with PyTorch