Tutorial 9: 计算机视觉基础
什么是计算机视觉?
计算机视觉(CV)是让计算机“看懂”图像和视频的技术。
计算机视觉任务
├── 图像分类(这是猫还是狗?)
├── 目标检测(图中有什么?在哪里?)
├── 语义分割(每个像素属于什么类别?)
├── 实例分割(区分不同的物体实例)
├── 姿态估计(人体关键点检测)
└── 图像生成(GAN、扩散模型)
图像基础
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
# An image is a 3-D array laid out as [height, width, channel];
# the channel axis is usually RGB (red, green, blue).

# Start from a 100x100 all-black canvas and paint a red square on it.
img = np.zeros((100, 100, 3), dtype=np.uint8)
img[20:80, 20:80] = (255, 0, 0)  # only the red channel is non-zero

# Render the array and write the figure to disk.
plt.imshow(img)
plt.title("简单图像")
plt.savefig('simple_image.png')

# PyTorch uses channel-first images, so reorder [H, W, C] -> [C, H, W]
# and rescale the uint8 values to floats in [0, 1].
chw_uint8 = torch.from_numpy(img).permute(2, 0, 1)
tensor_img = chw_uint8.float() / 255.0
print(f"张量形状: {tensor_img.shape}")  # [3, 100, 100]
图像预处理
from torchvision import transforms
# Per-channel normalisation statistics shared by both pipelines below.
# NOTE(review): these look like the usual ImageNet statistics - confirm.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Common preprocessing pipeline: resize, convert to a tensor, normalise.
_base_steps = [
    transforms.Resize((224, 224)),  # fixed output size
    transforms.ToTensor(),  # tensor with values in [0, 1]
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
transform = transforms.Compose(_base_steps)

# Training-time pipeline: random augmentations first, then the same
# tensor conversion and normalisation as above.
_augmentation_steps = [
    transforms.RandomResizedCrop(224),  # random crop, resized to 224
    transforms.RandomHorizontalFlip(),  # random left-right flip
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(15),  # random rotation
]
train_transform = transforms.Compose(
    _augmentation_steps
    + [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)
卷积神经网络(CNN)
CNN 是处理图像的核心架构。
卷积操作
import torch
import torch.nn as nn
# A 2-D convolution layer. The first two positional arguments are:
#   in_channels  - number of channels in the input
#   out_channels - number of channels produced (i.e. how many kernels)
# kernel_size is the side length of each kernel.
conv = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)

# Inputs are laid out as [batch, channels, height, width].
x = torch.randn(1, 3, 224, 224)
output = conv(x)
print(f"输出形状: {output.shape}")  # [1, 16, 224, 224]

# Max pooling with a 2x2 window and stride 2 halves height and width.
maxpool = nn.MaxPool2d(2, stride=2)
pooled = maxpool(output)
print(f"池化后: {pooled.shape}")  # [1, 16, 112, 112]
简单的 CNN
class SimpleCNN(nn.Module):
    """A small convolutional classifier for 3-channel images.

    ``features`` stacks three stages of Conv(3x3, padding=1) -> BatchNorm ->
    ReLU -> MaxPool(2), widening the channels 3 -> 32 -> 64 -> 128 while each
    pooling step halves the spatial size (e.g. 224 -> 112 -> 56 -> 28).
    ``classifier`` applies global average pooling followed by a two-layer
    MLP with dropout, so the network does not depend on a fixed input size.

    Args:
        num_classes: Number of output logits.
    """

    @staticmethod
    def _stage(in_ch, out_ch):
        """Return the layers of one conv stage, in execution order."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]

    def __init__(self, num_classes=10):
        super().__init__()

        # Convolutional feature extractor: one stage per consecutive width pair.
        widths = (3, 32, 64, 128)
        stage_layers = []
        for in_ch, out_ch in zip(widths, widths[1:]):
            stage_layers.extend(self._stage(in_ch, out_ch))
        self.features = nn.Sequential(*stage_layers)

        # Classification head.
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Map a [batch, 3, H, W] tensor to [batch, num_classes] logits."""
        return self.classifier(self.features(x))
# Build the network and report how many parameters it holds in total.
model = SimpleCNN(num_classes=10)
param_count = sum(param.numel() for param in model.parameters())
print(f"参数量: {param_count:,}")
经典 CNN 架构
# VGG 风格的块
def make_vgg_block(in_channels, out_channels, num_convs):
    """Build a VGG-style block: ``num_convs`` x (Conv3x3 + ReLU), then MaxPool.

    Args:
        in_channels: Channels of the tensor entering the block.
        out_channels: Channels produced by every convolution in the block.
        num_convs: How many Conv + ReLU pairs to stack before pooling.

    Returns:
        An ``nn.Sequential`` whose last layer is a 2x2, stride-2 max pool.
    """
    stack = []
    channels = in_channels
    for _ in range(num_convs):
        stack += [nn.Conv2d(channels, out_channels, 3, padding=1), nn.ReLU()]
        # Only the first convolution sees the block's input width.
        channels = out_channels
    stack.append(nn.MaxPool2d(2, 2))
    return nn.Sequential(*stack)
# ResNet 风格的残差块
class ResidualBlock(nn.Module):
    """ResNet-style residual block: two 3x3 conv + BatchNorm layers plus a skip path.

    The first convolution applies ``stride``; the second always uses stride 1.
    When the stride or channel count changes the output shape, the skip path
    is a 1x1 convolution + BatchNorm projection; otherwise it is an empty
    ``nn.Sequential`` that passes the input through unchanged.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels of the output tensor.
        stride: Stride of the first convolution (and of the projection).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut connection: project only when the shapes would not match.
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = self.conv1(x)
        hidden = torch.relu(self.bn1(hidden))
        out = self.conv2(hidden)
        out = self.bn2(out)
        out += self.shortcut(x)  # residual connection
        return torch.relu(out)
实战:CIFAR-10 图像分类
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# 1. Data preparation
# Per-channel mean / std applied to both the training and the test split.
_CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: padded random crop and random flip, then normalise.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
])

# Evaluation pipeline: no augmentation, only tensor conversion + normalise.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
])

# Only the training split asks for a download into ./data.
train_dataset = datasets.CIFAR10(
    './data', train=True, download=True, transform=transform_train
)
test_dataset = datasets.CIFAR10(
    './data', train=False, transform=transform_test
)

train_loader = DataLoader(
    train_dataset, batch_size=128, shuffle=True, num_workers=2
)
test_loader = DataLoader(test_dataset, batch_size=256)

# Human-readable class names.
# NOTE(review): presumably ordered to match the dataset's label indices - confirm.
classes = ('飞机', '汽车', '鸟', '猫', '鹿', '狗', '青蛙', '马', '船', '卡车')
# 2. 定义网络
class CIFAR10Net(nn.Module):
    """VGG-like classifier producing 10 logits from 3x32x32 images.

    ``features`` holds three blocks; each block is two Conv(3x3, padding=1)
    -> BatchNorm -> ReLU groups followed by MaxPool(2) and Dropout(0.25).
    Channel widths go 3 -> 64 -> 128 -> 256. ``classifier`` flattens the
    result and expects 256 * 4 * 4 features, i.e. a 32x32 input after three
    halvings.
    """

    @staticmethod
    def _double_conv_block(in_ch, out_ch):
        """Return the eight layers of one block, in execution order."""
        return [
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        ]

    def __init__(self):
        super().__init__()

        feature_layers = []
        for in_ch, out_ch in ((3, 64), (64, 128), (128, 256)):
            feature_layers.extend(self._double_conv_block(in_ch, out_ch))
        self.features = nn.Sequential(*feature_layers)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Map a [batch, 3, 32, 32] tensor to [batch, 10] logits."""
        return self.classifier(self.features(x))
# 3. Training
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CIFAR10Net()
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
# The scheduler lowers the learning rate once the metric passed to
# `scheduler.step(...)` has stopped improving for `patience` steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=3
)
def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network to optimise; it is switched to training mode.
        loader: Iterable yielding ``(images, labels)`` batches. It does not
            need to support ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``. It is
            assumed to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer stepped once per batch.
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``, both averaged over the samples seen.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight each batch-mean loss by its batch size so that a shorter
        # final batch does not count as much as a full one.
        loss_sum += loss.item() * batch_size
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        total += batch_size

    if total == 0:
        raise ValueError("train_epoch received a loader with no samples")
    return loss_sum / total, correct / total
def test(model, loader, device):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is put in evaluation mode and gradients are disabled for the
    whole pass. Each batch is moved to ``device`` before the forward pass, and
    the predicted class is the index of the largest output along dimension 1.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            predictions = model(batch_images).max(1)[1]
            hits += predictions.eq(batch_labels).sum().item()
            seen += batch_labels.size(0)
    return hits / seen
print("开始训练 CIFAR-10 分类器...")

NUM_EPOCHS = 20
LOG_EVERY = 5  # print a progress line every this many epochs

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    test_acc = test(model, test_loader, device)

    # The plateau scheduler is driven by the training loss of this epoch.
    scheduler.step(train_loss)

    if (epoch + 1) % LOG_EVERY != 0:
        continue
    print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, "
          f"Train Acc={train_acc:.2%}, Test Acc={test_acc:.2%}")
使用预训练模型
from torchvision import models
# Load a ResNet-18 with pretrained weights.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=`; confirm the installed version before switching.
resnet = models.resnet18(pretrained=True)

# Freeze every pretrained parameter so the backbone is a fixed feature extractor.
for param in resnet.parameters():
    param.requires_grad = False

# Replace the classification head. The new layer is created after the freeze
# loop above, so its parameters are still trainable.
num_classes = 10
resnet.fc = nn.Linear(resnet.fc.in_features, num_classes)

# Stage 1: optimise only the new classification head.
optimizer = optim.Adam(resnet.fc.parameters(), lr=0.001)

# Stage 2 (fine-tuning): unfreeze part of the backbone.
for param in resnet.layer4.parameters():
    param.requires_grad = True
# The optimizer above was built from the head's parameters only. Without
# registering layer4 here, its weights would receive gradients but never be
# updated by `optimizer.step()`.
optimizer.add_param_group({"params": list(resnet.layer4.parameters())})
目标检测简介
# 目标检测输出: 边界框 + 类别
# 边界框格式: [x_min, y_min, x_max, y_max] 或 [x_center, y_center, width, height]
class SimpleDetector(nn.Module):
    """Simplified object detector with per-location class and box heads.

    The backbone is two Conv(3x3, padding=1) -> ReLU -> MaxPool(2) stages
    (3 -> 64 -> 128 channels), so the feature map is 1/4 of the input size.
    Two 1x1 convolutions then predict, at every feature-map location,
    ``num_classes`` class scores and 4 bounding-box values.

    Args:
        num_classes: Number of class channels produced by the class head.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Feature extractor.
        backbone_layers = []
        for in_ch, out_ch in ((3, 64), (64, 128)):
            backbone_layers += [
                nn.Conv2d(in_ch, out_ch, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        self.backbone = nn.Sequential(*backbone_layers)

        # Prediction heads.
        self.classifier = nn.Conv2d(128, num_classes, kernel_size=1)  # classes
        self.regressor = nn.Conv2d(128, 4, kernel_size=1)  # bounding boxes

    def forward(self, x):
        """Return ``(class_pred, bbox_pred)`` computed from the shared features."""
        feature_map = self.backbone(x)
        return self.classifier(feature_map), self.regressor(feature_map)
# 计算 IoU (Intersection over Union)
def iou(box1, box2):
    """Compute the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        box1: Box in corner format ``[x_min, y_min, x_max, y_max]``.
        box2: Box in the same corner format.

    Returns:
        Intersection area divided by union area. For plain numeric
        coordinates this is always a float; ``0.0`` is returned when the
        union has no area (e.g. both boxes are degenerate).
    """
    # Corners of the overlap rectangle (empty when right < left or bottom < top).
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp each side at 0 so disjoint boxes yield a zero intersection
    # instead of a positive product of two negative extents.
    intersection = max(0, right - left) * max(0, bottom - top)

    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection

    # Return 0.0 rather than 0 so the result type matches the division branch.
    return intersection / union if union > 0 else 0.0
关键概念总结
| 概念 | 解释 |
|---|---|
| 卷积 | 滑动窗口提取局部特征 |
| 池化 | 降低空间分辨率,增加感受野 |
| 特征图 | 卷积层的输出 |
| 感受野 | 输出像素对应的输入区域 |
| 残差连接 | 跳跃连接,解决梯度消失 |
| 迁移学习 | 使用预训练模型 |
下一步
在最后一个教程中,我们将学习强化学习的基础知识。