# Tutorial 10: Reinforcement Learning Basics

## What Is Reinforcement Learning?

Reinforcement learning (RL) is a method in which an agent learns an optimal behavior policy by interacting with an environment.
```
                state s_t, reward r_t
        ┌──────────────────────────────────┐
        ▼                                  │
  ┌───────────┐      action a_t      ┌─────────────┐
  │   Agent   │─────────────────────►│ Environment │
  └───────────┘                      └─────────────┘

  Goal: maximize the cumulative reward  Σ γ^t × r_t
```
## Key Concepts

- **State**: the current situation of the environment
- **Action**: an operation the agent can take
- **Reward**: the environment's feedback on an action
- **Policy**: a mapping from states to actions
- **Value function**: the long-term value of a state or action
## Reinforcement Learning vs. Other Kinds of Learning

| Learning type | Data source | Feedback | Examples |
|---|---|---|---|
| Supervised learning | Labeled data | Correct answers | Image classification |
| Unsupervised learning | Unlabeled data | No feedback | Clustering |
| Reinforcement learning | Generated by interaction | Reward signal | Games, robotics |
## Markov Decision Processes (MDPs)

A reinforcement learning problem is usually modeled as an MDP:

- S: the state space
- A: the action space
- P(s'|s, a): the state-transition probabilities
- R(s, a, s'): the reward function
- γ: the discount factor (0 < γ ≤ 1)
```python
import numpy as np

class SimpleMDP:
    """A simple MDP environment."""

    def __init__(self):
        # States: 0, 1, 2, 3, 4 (4 is the terminal state)
        self.n_states = 5
        self.n_actions = 2  # 0: left, 1: right
        self.terminal_state = 4
        # Reward: +10 for reaching the terminal state
        self.rewards = {4: 10}

    def step(self, state, action):
        """Take an action and return (next_state, reward, done)."""
        if state == self.terminal_state:
            return state, 0, True
        # State transition
        if action == 0:  # left
            new_state = max(0, state - 1)
        else:  # right
            new_state = min(self.terminal_state, state + 1)
        reward = self.rewards.get(new_state, -1)  # -1 per step encourages reaching the goal quickly
        done = new_state == self.terminal_state
        return new_state, reward, done
```
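
To get a feel for the dynamics, here is a quick sanity check (not part of the class above) that always moves right until the episode ends:

```python
# Minimal sketch: roll out the "always move right" behavior in SimpleMDP.
env = SimpleMDP()
state, total_reward = 0, 0
while True:
    state, reward, done = env.step(state, action=1)  # always move right
    total_reward += reward
    if done:
        break
print(f"Reached state {state} with total reward {total_reward}")  # expect state 4, reward 10 - 3 = 7
```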
## Value Functions

### 1. The state-value function V(s)

The expected cumulative reward obtained by starting in state s and following policy π.
```python
def compute_state_values(mdp, policy, gamma=0.9, iterations=100):
    """Policy evaluation: compute state values under a deterministic policy."""
    V = np.zeros(mdp.n_states)
    for _ in range(iterations):
        V_new = np.zeros(mdp.n_states)
        for s in range(mdp.n_states):
            if s == mdp.terminal_state:
                continue
            action = policy[s]
            next_state, reward, done = mdp.step(s, action)
            if done:
                V_new[s] = reward
            else:
                V_new[s] = reward + gamma * V[next_state]
        V = V_new
    return V
```
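
As a quick check, we can evaluate the "always move right" policy on the SimpleMDP defined earlier; the numbers in the comment follow from that environment and will change if you alter its rewards:

```python
# Evaluate the deterministic policy "always go right" (action 1 in every state).
env = SimpleMDP()
always_right = [1] * env.n_states
V = compute_state_values(env, always_right, gamma=0.9)
print(V)  # V[3] = 10, V[2] = -1 + 0.9*10 = 8, V[1] = 6.2, V[0] = 4.58, V[4] = 0
```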
### 2. The action-value function Q(s, a)

The expected cumulative reward obtained by taking action a in state s and following the policy thereafter.
```python
def compute_q_values(mdp, V, gamma=0.9):
    """Compute Q-values from state values for a deterministic MDP."""
    Q = np.zeros((mdp.n_states, mdp.n_actions))
    for s in range(mdp.n_states):
        for a in range(mdp.n_actions):
            next_state, reward, done = mdp.step(s, a)
            if done:
                Q[s, a] = reward
            else:
                Q[s, a] = reward + gamma * V[next_state]
    return Q
```
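
A common use of Q-values is to read off a greedy policy by taking the argmax over actions in each state; a small sketch that reuses the two functions above:

```python
# Greedy policy extraction: pick the highest-valued action in each state.
env = SimpleMDP()
V = compute_state_values(env, policy=[1] * env.n_states, gamma=0.9)
Q = compute_q_values(env, V, gamma=0.9)
greedy_policy = np.argmax(Q, axis=1)
print(greedy_policy)  # expect action 1 ("right") for every non-terminal state
```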
## Q-Learning

Q-Learning is a model-free, off-policy reinforcement learning algorithm.
```python
import numpy as np
import random

class QLearning:
    """Tabular Q-Learning."""

    def __init__(self, n_states, n_actions, lr=0.1, gamma=0.99, epsilon=1.0):
        self.n_states = n_states
        self.n_actions = n_actions
        self.lr = lr            # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        # Q-table
        self.Q = np.zeros((n_states, n_actions))

    def select_action(self, state):
        """Pick an action with an ε-greedy policy."""
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        return np.argmax(self.Q[state])

    def learn(self, state, action, reward, next_state, done):
        """Update the Q-value for (state, action)."""
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(self.Q[next_state])
        # Q-Learning update rule
        self.Q[state, action] += self.lr * (target - self.Q[state, action])
        # Decay the exploration rate
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Training
def train_q_learning(env, agent, episodes=1000):
    rewards_history = []
    for episode in range(episodes):
        state = 0  # initial state
        total_reward = 0
        for step in range(100):
            action = agent.select_action(state)
            next_state, reward, done = env.step(state, action)
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                break
        rewards_history.append(total_reward)
        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(rewards_history[-100:])
            print(f"Episode {episode+1}, Avg Reward: {avg_reward:.2f}, Epsilon: {agent.epsilon:.3f}")
    return rewards_history

# Run it
env = SimpleMDP()
agent = QLearning(env.n_states, env.n_actions)
rewards = train_q_learning(env, agent)

print("\nLearned Q-table:")
print(agent.Q)

print("\nOptimal policy:")
for s in range(env.n_states - 1):
    action = "right" if np.argmax(agent.Q[s]) == 1 else "left"
    print(f"  State {s}: {action}")
```
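
To verify the learned policy, one can run a greedy rollout (no exploration) from the start state. A minimal sketch, assuming the `env` and `agent` trained above:

```python
# Greedy rollout with the learned Q-table (exploration disabled).
state, path, total_reward = 0, [0], 0
for _ in range(20):
    action = int(np.argmax(agent.Q[state]))
    state, reward, done = env.step(state, action)
    path.append(state)
    total_reward += reward
    if done:
        break
print(f"Greedy path: {path}, return: {total_reward}")  # ideally 0 -> 1 -> 2 -> 3 -> 4, return 7
```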
## Deep Q-Networks (DQN)

A DQN approximates the Q-function with a neural network, which makes large state spaces tractable.
```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DQN(nn.Module):
    """Deep Q-network."""

    def __init__(self, state_size, action_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(state_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_size)
        )

    def forward(self, x):
        return self.network(x)

class DQNAgent:
    """DQN agent with experience replay and a target network."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # Hyperparameters
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.batch_size = 32
        # Experience replay buffer
        self.memory = deque(maxlen=10000)
        # Online and target networks
        self.q_network = DQN(state_size, action_size)
        self.target_network = DQN(state_size, action_size)
        self.update_target_network()
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=0.001)

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Store a transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def select_action(self, state):
        """ε-greedy action selection."""
        if random.random() < self.epsilon:
            return random.randint(0, self.action_size - 1)
        with torch.no_grad():
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            q_values = self.q_network(state_tensor)
            return q_values.argmax().item()

    def replay(self):
        """Learn from a random minibatch of past experience."""
        if len(self.memory) < self.batch_size:
            return
        # Sample a minibatch
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.tensor(np.array(states), dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)
        # Target Q-values from the (frozen) target network
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            targets = rewards + (1 - dones) * self.gamma * next_q_values
        # Current Q-values for the actions actually taken
        current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Gradient step
        loss = nn.MSELoss()(current_q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Decay the exploration rate
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        return loss.item()
```
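
As a bridge back to the toy environment, here is a minimal sketch (an assumption of this rewrite, not part of the original tutorial) that runs the DQN agent on SimpleMDP by one-hot encoding its discrete states into float vectors:

```python
# Minimal sketch (assumption): DQN on SimpleMDP with one-hot state encoding.
def one_hot(state, n_states=5):
    v = np.zeros(n_states, dtype=np.float32)
    v[state] = 1.0
    return v

env = SimpleMDP()
agent = DQNAgent(state_size=env.n_states, action_size=env.n_actions)

for episode in range(200):
    state = 0
    for step in range(100):
        action = agent.select_action(one_hot(state))
        next_state, reward, done = env.step(state, action)
        agent.remember(one_hot(state), action, reward, one_hot(next_state), done)
        agent.replay()
        state = next_state
        if done:
            break
    if (episode + 1) % 20 == 0:
        agent.update_target_network()
```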
## Policy Gradient Methods

Policy gradient methods optimize the policy directly instead of learning a value function.
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class PolicyNetwork(nn.Module):
    """Policy network: maps a state to a probability distribution over actions."""

    def __init__(self, state_size, action_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(state_size, 64),
            nn.ReLU(),
            nn.Linear(64, action_size),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.network(x)

class REINFORCEAgent:
    """The REINFORCE algorithm (Monte Carlo policy gradient)."""

    def __init__(self, state_size, action_size):
        self.policy = PolicyNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=0.01)
        self.gamma = 0.99
        # Data collected over one episode
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action from the current policy."""
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.policy(state_tensor)
        dist = Categorical(probs)
        action = dist.sample()
        self.log_probs.append(dist.log_prob(action))
        return action.item()

    def store_reward(self, reward):
        """Record the reward for the current step."""
        self.rewards.append(reward)

    def learn(self):
        """Update the policy at the end of an episode."""
        # Compute discounted returns
        returns = []
        G = 0
        for r in reversed(self.rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize
        # Policy-gradient loss
        policy_loss = []
        for log_prob, G in zip(self.log_probs, returns):
            policy_loss.append(-log_prob * G)
        loss = torch.stack(policy_loss).sum()
        # Gradient step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Clear the episode buffers
        self.log_probs = []
        self.rewards = []
        return loss.item()
```
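
REINFORCE can be trained on the same CartPole environment used in the next section; a minimal sketch, assuming the same Gym API (gym >= 0.26, where `reset` returns `(obs, info)` and `step` returns five values):

```python
import gym

# Minimal sketch: REINFORCE on CartPole, one policy update per episode.
env = gym.make('CartPole-v1')
agent = REINFORCEAgent(env.observation_space.shape[0], env.action_space.n)

for episode in range(500):
    state = env.reset()[0]
    total_reward = 0
    while True:
        action = agent.select_action(state)
        state, reward, done, truncated, info = env.step(action)
        agent.store_reward(reward)
        total_reward += reward
        if done or truncated:
            break
    agent.learn()  # update the policy from the finished episode
    if (episode + 1) % 50 == 0:
        print(f"Episode {episode+1}, Reward: {total_reward}")
env.close()
```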
## Hands-On: The CartPole Game
```python
import gym  # pip install gym
import torch
import numpy as np

# Create the environment
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]  # 4
action_size = env.action_space.n             # 2

# Train with DQN
agent = DQNAgent(state_size, action_size)
episodes = 500
target_update_freq = 10

for episode in range(episodes):
    state = env.reset()[0]
    total_reward = 0
    for step in range(500):
        action = agent.select_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.replay()
        state = next_state
        total_reward += reward
        if done or truncated:
            break
    # Periodically sync the target network
    if (episode + 1) % target_update_freq == 0:
        agent.update_target_network()
    if (episode + 1) % 50 == 0:
        print(f"Episode {episode+1}, Reward: {total_reward}, Epsilon: {agent.epsilon:.3f}")

env.close()
```
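
After training, it is common to evaluate the greedy policy with exploration switched off. A minimal sketch, reusing the `agent` trained above and the same Gym API:

```python
# Evaluate the trained agent greedily (no random actions).
eval_env = gym.make('CartPole-v1')
agent.epsilon = 0.0  # disable exploration
state = eval_env.reset()[0]
total_reward = 0
while True:
    action = agent.select_action(state)
    state, reward, done, truncated, _ = eval_env.step(action)
    total_reward += reward
    if done or truncated:
        break
print(f"Evaluation reward: {total_reward}")
eval_env.close()
```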
## Summary of RL Algorithms

| Algorithm | Type | Characteristics | Typical use cases |
|---|---|---|---|
| Q-Learning | Value-based | Simple, off-policy | Small state spaces |
| DQN | Value-based + deep learning | Handles large state spaces | Atari games |
| REINFORCE | Policy gradient | Optimizes the policy directly | Continuous actions |
| A2C/A3C | Actor-Critic | Combines value and policy learning | General purpose |
| PPO | Policy gradient | Stable, widely used | Robotics, games |
## Key Concepts Recap

| Concept | Explanation |
|---|---|
| MDP | Markov decision process, the mathematical framework of RL |
| Policy | A mapping from states to actions |
| Value function | The long-term value of a state or action |
| Exploration vs. exploitation | Trying new actions vs. using the best-known one |
| Experience replay | Storing and reusing past experience |
| Target network | A technique for stabilizing DQN training |
## Wrapping Up

Congratulations on completing this introductory AI tutorial series!

You have learned about:

✅ The definition and history of artificial intelligence
✅ The concept and types of intelligent agents
✅ Search algorithms (BFS, DFS, A*)
✅ Knowledge representation and reasoning
✅ Machine learning fundamentals
✅ How neural networks work
✅ Hands-on deep learning with PyTorch
✅ Natural language processing basics
✅ Computer vision basics
✅ Reinforcement learning basics

### Next Steps

- Go deeper: pick a direction that interests you and study it in depth
- Build projects: implement complete AI projects yourself
- Read papers: follow the latest research
- Join the community: exchange ideas with other AI learners

🎉 Keep making progress on your AI journey!