Tutorial 8: Natural Language Processing Fundamentals

What is NLP?

Natural Language Processing (NLP) is the set of techniques that lets computers understand, process, and generate human language.

NLP application areas
├── Text classification (spam detection, sentiment analysis)
├── Named entity recognition (identifying person names, place names, etc.)
├── Machine translation (e.g. Chinese → English)
├── Question answering (ChatGPT)
├── Text summarization
└── Text generation

Challenges in NLP

  • Ambiguity: the same word can mean different things, e.g. "苹果很好吃" ("apples taste great", the fruit) vs. "苹果股价上涨" ("Apple's stock price is rising", the company)

  • Context dependence: what does "it" refer to?

  • Language diversity: dialects, slang, newly coined words

  • Commonsense reasoning: background knowledge is required

Text Preprocessing

import re

class TextPreprocessor:
    """Text preprocessor"""

    def __init__(self):
        pass

    def clean_text(self, text):
        """Clean the text"""
        # Lowercase
        text = text.lower()
        # Remove special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text):
        """Tokenize on whitespace"""
        return text.split()

    def remove_stopwords(self, tokens, stopwords):
        """Remove stopwords"""
        return [t for t in tokens if t not in stopwords]

# Example
preprocessor = TextPreprocessor()

text = "Hello, World! This is an NLP tutorial. NLP is amazing!"
clean = preprocessor.clean_text(text)
tokens = preprocessor.tokenize(clean)

print(f"Original: {text}")
print(f"Cleaned: {clean}")
print(f"Tokens: {tokens}")

Chinese Word Segmentation

# Chinese text is written without spaces, so it needs a dedicated segmentation tool
# pip install jieba

import jieba

text = "自然语言处理是人工智能的重要分支"  # "NLP is an important branch of AI"
tokens = list(jieba.cut(text))
print(f"Segmentation result: {tokens}")
# ['自然语言', '处理', '是', '人工智能', '的', '重要', '分支']

Text Representation

1. Bag of Words

import numpy as np

class BagOfWords:
    """词袋模型"""

    def __init__(self):
        self.vocabulary = {}

    def fit(self, documents):
        """构建词汇表"""
        all_words = set()
        for doc in documents:
            all_words.update(doc.split())

        self.vocabulary = {word: idx for idx, word in enumerate(sorted(all_words))}
        return self

    def transform(self, documents):
        """转换为向量"""
        vectors = []
        for doc in documents:
            vector = np.zeros(len(self.vocabulary))
            for word in doc.split():
                if word in self.vocabulary:
                    vector[self.vocabulary[word]] += 1
            vectors.append(vector)
        return np.array(vectors)

# Example
docs = [
    "i love machine learning",
    "machine learning is great",
    "deep learning is a subset of machine learning"
]

bow = BagOfWords()
bow.fit(docs)
vectors = bow.transform(docs)

print(f"词汇表: {bow.vocabulary}")
print(f"向量形状: {vectors.shape}")

2. TF-IDF

import numpy as np
from collections import Counter

class TFIDF:
    """TF-IDF 向量化"""

    def __init__(self):
        self.vocabulary = {}
        self.idf = {}

    def fit(self, documents):
        # Build the vocabulary
        all_words = set()
        for doc in documents:
            all_words.update(doc.split())
        self.vocabulary = {word: idx for idx, word in enumerate(sorted(all_words))}

        # Compute IDF (smoothed variant)
        n_docs = len(documents)
        doc_freq = Counter()
        for doc in documents:
            unique_words = set(doc.split())
            doc_freq.update(unique_words)

        self.idf = {
            word: np.log(n_docs / (df + 1)) + 1
            for word, df in doc_freq.items()
        }
        return self

    def transform(self, documents):
        vectors = []
        for doc in documents:
            # Compute TF
            word_counts = Counter(doc.split())
            total_words = len(doc.split())

            vector = np.zeros(len(self.vocabulary))
            for word, count in word_counts.items():
                if word in self.vocabulary:
                    tf = count / total_words
                    idf = self.idf.get(word, 1)
                    vector[self.vocabulary[word]] = tf * idf

            vectors.append(vector)
        return np.array(vectors)
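
The TFIDF class above is not exercised; a minimal usage sketch, reusing the docs list from the Bag of Words example:

tfidf = TFIDF()
tfidf.fit(docs)
tfidf_vectors = tfidf.transform(docs)

print(f"Vocabulary size: {len(tfidf.vocabulary)}")
print(f"TF-IDF vector shape: {tfidf_vectors.shape}")  # (3, vocabulary size)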

3. Word Embeddings

Word embeddings map words into a dense, low-dimensional vector space.

import torch
import torch.nn as nn

# Use PyTorch's Embedding layer
vocab_size = 10000
embedding_dim = 100

embedding = nn.Embedding(vocab_size, embedding_dim)

# Word indices
word_indices = torch.tensor([1, 5, 100, 999])

# Look up the word vectors
word_vectors = embedding(word_indices)
print(f"Word vector shape: {word_vectors.shape}")  # [4, 100]

# Compute word similarity
def cosine_similarity(v1, v2):
    return torch.dot(v1, v2) / (v1.norm() * v2.norm())
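
As a quick check, cosine_similarity can be applied to two of the embedding vectors above; since the embedding layer is randomly initialized and untrained, the value itself is meaningless:

sim = cosine_similarity(word_vectors[0], word_vectors[1])
print(f"Cosine similarity: {sim.item():.4f}")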

A Simplified Word2Vec Implementation

import torch
import torch.nn as nn
import torch.optim as optim

class SkipGram(nn.Module):
    """Skip-gram 模型"""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context, negative):
        """
        center: 中心词索引 [batch]
        context: 上下文词索引 [batch]
        negative: 负采样词索引 [batch, num_neg]
        """
        # 获取嵌入
        center_emb = self.center_embeddings(center)      # [batch, dim]
        context_emb = self.context_embeddings(context)   # [batch, dim]
        neg_emb = self.context_embeddings(negative)      # [batch, num_neg, dim]

        # 正样本得分
        pos_score = torch.sum(center_emb * context_emb, dim=1)  # [batch]
        pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-10)

        # 负样本得分
        neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze()  # [batch, num_neg]
        neg_loss = -torch.log(torch.sigmoid(-neg_score) + 1e-10).sum(dim=1)

        return (pos_loss + neg_loss).mean()

    def get_embedding(self, word_idx):
        return self.center_embeddings(torch.tensor([word_idx]))
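
The class only defines the loss; a minimal training step is sketched below, with random indices standing in for real (center, context, negative) pairs and purely illustrative hyperparameters:

vocab_size, embedding_dim = 1000, 64
model = SkipGram(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Random indices as placeholders for a real batch of skip-gram pairs
center = torch.randint(0, vocab_size, (32,))
context = torch.randint(0, vocab_size, (32,))
negative = torch.randint(0, vocab_size, (32, 5))  # 5 negative samples per pair

loss = model(center, context, negative)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(f"Skip-gram loss: {loss.item():.4f}")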

Recurrent Neural Networks (RNN)

RNNs process sequences step by step and carry information from earlier time steps forward in a hidden state.

import torch
import torch.nn as nn

# Basic RNN
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: [batch, seq_len, input_size]
        output, hidden = self.rnn(x)
        # output: [batch, seq_len, hidden_size]
        # hidden: [1, batch, hidden_size]

        # Use the output of the last time step
        last_output = output[:, -1, :]
        return self.fc(last_output)

# LSTM (mitigates the vanishing-gradient problem)
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # x: [batch, seq_len]
        embedded = self.embedding(x)  # [batch, seq_len, embedding_dim]
        output, (hidden, cell) = self.lstm(embedded)

        # Concatenate the final hidden states of the two LSTM directions
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(self.dropout(hidden_cat))
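
A quick shape check for the bidirectional classifier above, with made-up hyperparameters and random token indices:

model = LSTMClassifier(vocab_size=5000, embedding_dim=64, hidden_size=128, output_size=2)
dummy_batch = torch.randint(0, 5000, (8, 20))  # 8 sequences of 20 token indices
logits = model(dummy_batch)
print(f"Output shape: {logits.shape}")  # [8, 2]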

Hands-on: Sentiment Analysis

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# 1. Data preparation
# Toy data
texts = [
    "this movie is great",
    "i love this film",
    "amazing performance",
    "terrible movie",
    "i hate this",
    "worst film ever",
    "excellent story",
    "boring and slow"
]
labels = [1, 1, 1, 0, 0, 0, 1, 0]  # 1: positive, 0: negative

# Build the vocabulary
all_words = []
for text in texts:
    all_words.extend(text.lower().split())

word_counts = Counter(all_words)
vocab = {word: idx + 2 for idx, (word, _) in enumerate(word_counts.most_common())}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Convert text to indices
def text_to_indices(text, vocab, max_len=10):
    indices = [vocab.get(word, vocab['<UNK>']) for word in text.lower().split()]
    # Pad or truncate
    if len(indices) < max_len:
        indices += [vocab['<PAD>']] * (max_len - len(indices))
    else:
        indices = indices[:max_len]
    return indices

# 2. Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=10):
        self.data = [
            (torch.tensor(text_to_indices(text, vocab, max_len)), label)
            for text, label in zip(texts, labels)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

dataset = SentimentDataset(texts, labels, vocab)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# 3. Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32, hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        return self.fc(hidden.squeeze(0))

model = SentimentLSTM(len(vocab))
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 4. Training
print("Training the sentiment analysis model...")
for epoch in range(100):
    total_loss = 0
    for batch_x, batch_y in loader:
        optimizer.zero_grad()
        outputs = model(batch_x).squeeze()
        loss = criterion(outputs, batch_y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

# 5. Testing
model.eval()
test_texts = ["this is amazing", "i hate it"]
for text in test_texts:
    indices = torch.tensor([text_to_indices(text, vocab)])
    with torch.no_grad():
        output = torch.sigmoid(model(indices))
        sentiment = "正面" if output > 0.5 else "负面"
        print(f"'{text}' -> {sentiment} ({output.item():.2f})")

An Introduction to Transformers

The Transformer is the foundational architecture of modern NLP (GPT, BERT, and many other models are built on it).

import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    """自注意力机制"""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.shape

        # Compute Q, K, V and split into heads
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attention = torch.softmax(scores, dim=-1)

        # Weighted sum of values
        out = torch.matmul(attention, V)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)

        return self.out(out)

class TransformerBlock(nn.Module):
    """Transformer 块"""

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = SelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention + residual connection
        x = x + self.dropout(self.attention(self.norm1(x)))
        # Feed-forward network + residual connection
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x
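
A quick shape check of the block above, with illustrative hyperparameters:

block = TransformerBlock(embed_dim=64, num_heads=4, ff_dim=256)
x = torch.randn(2, 10, 64)  # [batch, seq_len, embed_dim]
out = block(x)
print(f"Output shape: {out.shape}")  # [2, 10, 64]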

Summary of Key Concepts

Concept            Explanation
Tokenization       Splitting text into words or subwords
Word embeddings    Mapping words to dense vectors
RNN/LSTM           Recurrent networks for processing sequences
Attention          Lets the model focus on the important parts of the input
Transformer        The modern architecture built on attention

Next Steps

In the next tutorial, we will cover the fundamentals of computer vision.

Tutorial 9: Computer Vision Fundamentals