####################################
Tutorial 9: Computer Vision Basics
####################################

.. include:: ../links.ref
.. include:: ../tags.ref
.. include:: ../abbrs.ref

What Is Computer Vision?
==========================

**Computer vision (CV)** is the set of techniques that let computers "understand" images and video.

.. code-block:: text

    Computer vision tasks
    ├── Image classification (is this a cat or a dog?)
    ├── Object detection (what is in the image, and where?)
    ├── Semantic segmentation (which class does each pixel belong to?)
    ├── Instance segmentation (tell individual object instances apart)
    ├── Pose estimation (human keypoint detection)
    └── Image generation (GANs, diffusion models)

Image Basics
============

.. code-block:: python

    import torch
    import numpy as np
    from PIL import Image
    import matplotlib.pyplot as plt

    # An image is a 3-D array: [height, width, channels]
    # The channels are usually RGB (red, green, blue)

    # Create a simple image
    img = np.zeros((100, 100, 3), dtype=np.uint8)
    img[20:80, 20:80, 0] = 255  # red square

    # Display it
    plt.imshow(img)
    plt.title("Simple image")
    plt.savefig('simple_image.png')

    # Image layout in PyTorch: [channels, height, width]
    # So we need to convert: [H, W, C] -> [C, H, W]
    tensor_img = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0
    print(f"Tensor shape: {tensor_img.shape}")  # [3, 100, 100]

Image Preprocessing
===================

.. code-block:: python

    from torchvision import transforms

    # Common image transforms
    transform = transforms.Compose([
        transforms.Resize((224, 224)),      # resize
        transforms.ToTensor(),              # convert to a tensor in [0, 1]
        transforms.Normalize(               # normalize
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    # Data augmentation (for training)
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),   # random crop and resize
        transforms.RandomHorizontalFlip(),   # random horizontal flip
        transforms.ColorJitter(              # color jitter
            brightness=0.2, contrast=0.2, saturation=0.2
        ),
        transforms.RandomRotation(15),       # random rotation
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

Convolutional Neural Networks (CNNs)
====================================

CNNs are the core architecture for processing images.

The Convolution Operation
-------------------------

.. code-block:: python

    import torch
    import torch.nn as nn

    # Convolution layer
    # in_channels:  number of input channels
    # out_channels: number of output channels (number of kernels)
    # kernel_size:  size of the convolution kernel
    conv = nn.Conv2d(
        in_channels=3,
        out_channels=16,
        kernel_size=3,
        stride=1,
        padding=1
    )

    # Input: [batch, channels, height, width]
    x = torch.randn(1, 3, 224, 224)
    output = conv(x)
    print(f"Output shape: {output.shape}")  # [1, 16, 224, 224]

    # Pooling layer
    maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
    pooled = maxpool(output)
    print(f"After pooling: {pooled.shape}")  # [1, 16, 112, 112]

A Simple CNN
------------

.. code-block:: python

    class SimpleCNN(nn.Module):
        """A simple convolutional neural network."""

        def __init__(self, num_classes=10):
            super().__init__()

            # Convolutional layers
            self.features = nn.Sequential(
                # First conv block
                nn.Conv2d(3, 32, kernel_size=3, padding=1),
                nn.BatchNorm2d(32),
                nn.ReLU(),
                nn.MaxPool2d(2),  # 224 -> 112

                # Second conv block
                nn.Conv2d(32, 64, kernel_size=3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2),  # 112 -> 56

                # Third conv block
                nn.Conv2d(64, 128, kernel_size=3, padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.MaxPool2d(2),  # 56 -> 28
            )

            # Classifier
            self.classifier = nn.Sequential(
                nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                nn.Flatten(),
                nn.Linear(128, 256),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(256, num_classes)
            )

        def forward(self, x):
            x = self.features(x)
            x = self.classifier(x)
            return x

    model = SimpleCNN(num_classes=10)
    print(f"Parameter count: {sum(p.numel() for p in model.parameters()):,}")
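
As a quick sanity check (not part of the original recipe), the short sketch below assumes the ``SimpleCNN`` class and the ``model`` instance defined above, and simply runs a dummy batch through the network to confirm the output shape.

.. code-block:: python

    # Sanity check: pass a dummy batch through SimpleCNN.
    x = torch.randn(4, 3, 224, 224)  # batch of 4 RGB images, 224x224
    logits = model(x)
    print(logits.shape)              # torch.Size([4, 10]) -- one logit per class

    # Because AdaptiveAvgPool2d((1, 1)) collapses the feature map to 1x1,
    # the same classifier also accepts other input sizes, e.g. 32x32:
    print(model(torch.randn(4, 3, 32, 32)).shape)  # torch.Size([4, 10])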

Classic CNN Architectures
=========================

.. code-block:: python

    # A VGG-style block
    def make_vgg_block(in_channels, out_channels, num_convs):
        layers = []
        for _ in range(num_convs):
            layers.append(nn.Conv2d(in_channels, out_channels, 3, padding=1))
            layers.append(nn.ReLU())
            in_channels = out_channels
        layers.append(nn.MaxPool2d(2, 2))
        return nn.Sequential(*layers)

    # A ResNet-style residual block
    class ResidualBlock(nn.Module):
        def __init__(self, in_channels, out_channels, stride=1):
            super().__init__()
            self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
            self.bn1 = nn.BatchNorm2d(out_channels)
            self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
            self.bn2 = nn.BatchNorm2d(out_channels)

            # Shortcut connection
            self.shortcut = nn.Sequential()
            if stride != 1 or in_channels != out_channels:
                self.shortcut = nn.Sequential(
                    nn.Conv2d(in_channels, out_channels, 1, stride, bias=False),
                    nn.BatchNorm2d(out_channels)
                )

        def forward(self, x):
            out = torch.relu(self.bn1(self.conv1(x)))
            out = self.bn2(self.conv2(out))
            out += self.shortcut(x)  # residual connection
            return torch.relu(out)

Hands-On: CIFAR-10 Image Classification
=======================================

.. code-block:: python

    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader
    from torchvision import datasets, transforms

    # 1. Data preparation
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616))
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616))
    ])

    train_dataset = datasets.CIFAR10('./data', train=True, download=True,
                                     transform=transform_train)
    test_dataset = datasets.CIFAR10('./data', train=False,
                                    transform=transform_test)

    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True,
                              num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=256)

    # Class names
    classes = ('airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')

    # 2. Define the network
    class CIFAR10Net(nn.Module):
        def __init__(self):
            super().__init__()
            self.features = nn.Sequential(
                # Block 1
                nn.Conv2d(3, 64, 3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.Conv2d(64, 64, 3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout(0.25),

                # Block 2
                nn.Conv2d(64, 128, 3, padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.Conv2d(128, 128, 3, padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout(0.25),

                # Block 3
                nn.Conv2d(128, 256, 3, padding=1),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.Conv2d(256, 256, 3, padding=1),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout(0.25),
            )

            self.classifier = nn.Sequential(
                nn.Flatten(),
                nn.Linear(256 * 4 * 4, 512),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(512, 10)
            )

        def forward(self, x):
            x = self.features(x)
            x = self.classifier(x)
            return x
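
    # (Added sanity-check sketch, assuming the CIFAR10Net class above:
    # three MaxPool2d(2) layers shrink the 32x32 CIFAR images to 4x4
    # feature maps, which is why the first Linear layer expects
    # 256 * 4 * 4 input features.)
    _check = CIFAR10Net()
    _dummy = torch.randn(2, 3, 32, 32)
    print(_check.features(_dummy).shape)  # torch.Size([2, 256, 4, 4])
    print(_check(_dummy).shape)           # torch.Size([2, 10])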

    # 3. Training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CIFAR10Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

    def train_epoch(model, loader, criterion, optimizer, device):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        return total_loss / len(loader), correct / total

    def test(model, loader, device):
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = outputs.max(1)
                correct += predicted.eq(labels).sum().item()
                total += labels.size(0)

        return correct / total

    print("Training the CIFAR-10 classifier...")
    for epoch in range(20):
        train_loss, train_acc = train_epoch(model, train_loader, criterion,
                                            optimizer, device)
        test_acc = test(model, test_loader, device)
        scheduler.step(train_loss)

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, "
                  f"Train Acc={train_acc:.2%}, Test Acc={test_acc:.2%}")

Using Pretrained Models
=======================

.. code-block:: python

    from torchvision import models

    # Load a pretrained ResNet
    resnet = models.resnet18(pretrained=True)

    # Freeze the feature-extraction layers
    for param in resnet.parameters():
        param.requires_grad = False

    # Replace the classification head
    num_classes = 10
    resnet.fc = nn.Linear(resnet.fc.in_features, num_classes)

    # Train only the new classification head
    optimizer = optim.Adam(resnet.fc.parameters(), lr=0.001)

    # Fine-tuning (unfreeze some of the later layers)
    for param in resnet.layer4.parameters():
        param.requires_grad = True

Introduction to Object Detection
================================

.. code-block:: python

    # Object detection output: bounding boxes + classes
    # Box format: [x_min, y_min, x_max, y_max] or [x_center, y_center, width, height]

    class SimpleDetector(nn.Module):
        """A simplified object detector."""

        def __init__(self, num_classes):
            super().__init__()
            # Feature extraction
            self.backbone = nn.Sequential(
                nn.Conv2d(3, 64, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Conv2d(64, 128, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            )

            # Prediction heads
            self.classifier = nn.Conv2d(128, num_classes, 1)  # classes
            self.regressor = nn.Conv2d(128, 4, 1)             # bounding boxes

        def forward(self, x):
            features = self.backbone(x)
            class_pred = self.classifier(features)
            bbox_pred = self.regressor(features)
            return class_pred, bbox_pred

    # Compute IoU (Intersection over Union)
    def iou(box1, box2):
        """Compute the IoU of two bounding boxes."""
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        intersection = max(0, x2 - x1) * max(0, y2 - y1)
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0

Key Concepts Summary
====================

.. csv-table::
    :header: "Concept", "Explanation"
    :widths: 25, 75

    "Convolution", "A sliding window that extracts local features"
    "Pooling", "Reduces spatial resolution and enlarges the receptive field"
    "Feature map", "The output of a convolution layer"
    "Receptive field", "The input region that an output pixel depends on"
    "Residual connection", "A skip connection that mitigates vanishing gradients"
    "Transfer learning", "Reusing a pretrained model"

Next Steps
==========

In the final tutorial, we will cover the basics of reinforcement learning.

:doc:`tutorial_10_reinforcement_learning`