Tutorial 8: Evaluating RAG Systems
Why Evaluate?
Evaluation is the foundation for improving a RAG system. It helps us:
Quantify system performance
Identify problems and bottlenecks
Compare alternative approaches
Guide the direction of optimization
RAG Evaluation Dimensions
RAG evaluation framework
┌─────────────────────────────────────────────────────────────┐
│ Retrieval evaluation                                         │
│ · Recall: how many of the relevant documents were found?     │
│ · Precision: how many returned documents are relevant?       │
│ · MRR: rank of the first relevant document                   │
├─────────────────────────────────────────────────────────────┤
│ Generation evaluation                                        │
│ · Accuracy: is the answer correct?                           │
│ · Completeness: is the answer comprehensive?                 │
│ · Relevance: does the answer address the question?           │
├─────────────────────────────────────────────────────────────┤
│ End-to-end evaluation                                        │
│ · User satisfaction                                          │
│ · Task completion rate                                       │
│ · Response time                                              │
└─────────────────────────────────────────────────────────────┘
Retrieval Evaluation Metrics
from typing import List, Set
import numpy as np

class RetrievalMetrics:
    """Retrieval evaluation metrics."""

    @staticmethod
    def precision_at_k(retrieved: List[str], relevant: Set[str], k: int) -> float:
        """Precision@K: fraction of the top-K results that are relevant."""
        retrieved_k = retrieved[:k]
        relevant_count = sum(1 for doc in retrieved_k if doc in relevant)
        return relevant_count / k if k > 0 else 0.0

    @staticmethod
    def recall_at_k(retrieved: List[str], relevant: Set[str], k: int) -> float:
        """Recall@K: fraction of all relevant documents found in the top K."""
        retrieved_k = set(retrieved[:k])
        found = len(retrieved_k & relevant)
        return found / len(relevant) if relevant else 0.0

    @staticmethod
    def mrr(retrieved: List[str], relevant: Set[str]) -> float:
        """MRR: reciprocal rank of the first relevant document."""
        for i, doc in enumerate(retrieved):
            if doc in relevant:
                return 1.0 / (i + 1)
        return 0.0

    @staticmethod
    def ndcg_at_k(retrieved: List[str], relevant: Set[str], k: int) -> float:
        """NDCG@K: normalized discounted cumulative gain."""
        dcg = 0.0
        for i, doc in enumerate(retrieved[:k]):
            if doc in relevant:
                dcg += 1.0 / np.log2(i + 2)  # i + 2 because log2(1) = 0
        # Ideal DCG: all relevant documents ranked at the top
        ideal_dcg = sum(1.0 / np.log2(i + 2) for i in range(min(k, len(relevant))))
        return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

# Usage example
metrics = RetrievalMetrics()

# Hypothetical retrieval results and ground-truth relevant documents
retrieved = ["doc1", "doc3", "doc5", "doc2", "doc4"]
relevant = {"doc1", "doc2", "doc6"}

print(f"Precision@3: {metrics.precision_at_k(retrieved, relevant, 3):.3f}")
print(f"Recall@3: {metrics.recall_at_k(retrieved, relevant, 3):.3f}")
print(f"MRR: {metrics.mrr(retrieved, relevant):.3f}")
print(f"NDCG@5: {metrics.ndcg_at_k(retrieved, relevant, 5):.3f}")
Generation Evaluation Metrics
1. Reference-Based Evaluation
from collections import Counter
import re

import numpy as np

class GenerationMetrics:
    """Generation evaluation metrics."""

    @staticmethod
    def exact_match(prediction: str, reference: str) -> float:
        """Exact match."""
        return 1.0 if prediction.strip() == reference.strip() else 0.0

    @staticmethod
    def f1_score(prediction: str, reference: str) -> float:
        """F1 score based on word overlap."""
        pred_tokens = set(prediction.lower().split())
        ref_tokens = set(reference.lower().split())
        if not pred_tokens or not ref_tokens:
            return 0.0
        common = pred_tokens & ref_tokens
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(ref_tokens)
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    @staticmethod
    def bleu_score(prediction: str, reference: str, n: int = 4) -> float:
        """Simplified BLEU score."""
        pred_tokens = prediction.lower().split()
        ref_tokens = reference.lower().split()
        if len(pred_tokens) == 0:
            return 0.0
        # n-gram precision for n = 1..4
        scores = []
        for i in range(1, n + 1):
            pred_ngrams = Counter(
                tuple(pred_tokens[j:j+i]) for j in range(len(pred_tokens) - i + 1)
            )
            ref_ngrams = Counter(
                tuple(ref_tokens[j:j+i]) for j in range(len(ref_tokens) - i + 1)
            )
            overlap = sum((pred_ngrams & ref_ngrams).values())
            total = sum(pred_ngrams.values())
            scores.append(overlap / total if total > 0 else 0)
        # Geometric mean of the n-gram precisions
        if all(s > 0 for s in scores):
            return np.exp(np.mean(np.log(scores)))
        return 0.0

# Usage
gen_metrics = GenerationMetrics()

prediction = "Python is a high-level programming language"
reference = "Python is a concise and readable high-level programming language"

print(f"Exact Match: {gen_metrics.exact_match(prediction, reference)}")
print(f"F1 Score: {gen_metrics.f1_score(prediction, reference):.3f}")
print(f"BLEU Score: {gen_metrics.bleu_score(prediction, reference):.3f}")
2. LLM-Based Evaluation
import re

from langchain_openai import ChatOpenAI

class LLMEvaluator:
    """LLM-based evaluator."""

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    def evaluate_faithfulness(self, answer: str, context: str) -> dict:
        """Evaluate faithfulness: is the answer grounded in the context?"""
        prompt = f"""Evaluate whether the following answer is faithful to the given context.

Context:
{context}

Answer:
{answer}

Please assess:
1. Does all of the information in the answer come from the context?
2. Is any information fabricated?

Output format:
Faithfulness score (0-10):
Reason:"""
        response = self.llm.invoke(prompt).content
        # Parse the score (simplified handling)
        try:
            score = int(re.search(r'(\d+)', response.split('\n')[0]).group(1))
        except (AttributeError, ValueError):
            score = 5  # fall back to a neutral score if parsing fails
        return {"score": score, "explanation": response}

    def evaluate_relevance(self, answer: str, question: str) -> dict:
        """Evaluate how relevant the answer is to the question."""
        prompt = f"""Evaluate how relevant the following answer is to the question.

Question: {question}
Answer: {answer}

Please assess:
1. Does the answer directly address the question?
2. Does it contain irrelevant information?

Output format:
Relevance score (0-10):
Reason:"""
        response = self.llm.invoke(prompt).content
        try:
            score = int(re.search(r'(\d+)', response.split('\n')[0]).group(1))
        except (AttributeError, ValueError):
            score = 5
        return {"score": score, "explanation": response}

    def evaluate_completeness(self, answer: str, question: str, context: str) -> dict:
        """Evaluate whether the answer is complete."""
        prompt = f"""Evaluate whether the answer fully addresses the question.

Question: {question}
Context: {context}
Answer: {answer}

Please assess:
1. Does the answer cover all aspects of the question?
2. Is any important information missing?

Output format:
Completeness score (0-10):
Reason:"""
        response = self.llm.invoke(prompt).content
        try:
            score = int(re.search(r'(\d+)', response.split('\n')[0]).group(1))
        except (AttributeError, ValueError):
            score = 5
        return {"score": score, "explanation": response}

# Usage
evaluator = LLMEvaluator()

context = "Python is a high-level programming language known for its concise, readable syntax. Python supports multiple programming paradigms."
question = "What are Python's main characteristics?"
answer = "Python is a concise, readable high-level programming language that supports multiple programming paradigms."

faithfulness = evaluator.evaluate_faithfulness(answer, context)
relevance = evaluator.evaluate_relevance(answer, question)

print(f"Faithfulness: {faithfulness['score']}/10")
print(f"Relevance: {relevance['score']}/10")
End-to-End Evaluation
from typing import List, Dict
from dataclasses import dataclass
import time

@dataclass
class TestCase:
    question: str
    expected_answer: str
    relevant_docs: List[str]

class RAGEvaluator:
    """End-to-end evaluator for a RAG system."""

    def __init__(self, rag_system):
        self.rag_system = rag_system
        self.retrieval_metrics = RetrievalMetrics()
        self.generation_metrics = GenerationMetrics()

    def evaluate_single(self, test_case: TestCase) -> Dict:
        """Evaluate a single test case."""
        start_time = time.time()

        # Run the RAG system
        result = self.rag_system.query(test_case.question)
        latency = time.time() - start_time

        # Retrieval evaluation (relevant_docs must use the same identifiers
        # as the page_content strings returned by the system)
        retrieved_docs = [doc.page_content for doc in result['retrieved_docs']]
        relevant_set = set(test_case.relevant_docs)

        retrieval_scores = {
            "precision@3": self.retrieval_metrics.precision_at_k(
                retrieved_docs, relevant_set, 3
            ),
            "recall@3": self.retrieval_metrics.recall_at_k(
                retrieved_docs, relevant_set, 3
            ),
            "mrr": self.retrieval_metrics.mrr(retrieved_docs, relevant_set)
        }

        # Generation evaluation
        generation_scores = {
            "f1": self.generation_metrics.f1_score(
                result['answer'], test_case.expected_answer
            ),
            "bleu": self.generation_metrics.bleu_score(
                result['answer'], test_case.expected_answer
            )
        }

        return {
            "question": test_case.question,
            "answer": result['answer'],
            "expected": test_case.expected_answer,
            "retrieval": retrieval_scores,
            "generation": generation_scores,
            "latency": latency
        }

    def evaluate_batch(self, test_cases: List[TestCase]) -> Dict:
        """Evaluate a batch of test cases."""
        results = []
        for tc in test_cases:
            result = self.evaluate_single(tc)
            results.append(result)

        # Aggregate metrics
        avg_metrics = {
            "retrieval": {
                "precision@3": np.mean([r["retrieval"]["precision@3"] for r in results]),
                "recall@3": np.mean([r["retrieval"]["recall@3"] for r in results]),
                "mrr": np.mean([r["retrieval"]["mrr"] for r in results])
            },
            "generation": {
                "f1": np.mean([r["generation"]["f1"] for r in results]),
                "bleu": np.mean([r["generation"]["bleu"] for r in results])
            },
            "latency": {
                "mean": np.mean([r["latency"] for r in results]),
                "p95": np.percentile([r["latency"] for r in results], 95)
            }
        }

        return {
            "individual_results": results,
            "aggregate_metrics": avg_metrics
        }
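RAGEvaluator makes only one assumption about the system under test: query() returns a dict with an 'answer' string and a 'retrieved_docs' list of objects exposing a page_content attribute (LangChain Document objects fit). A hypothetical stub illustrating that contract:

from langchain_core.documents import Document

class MyRAGSystem:
    """Hypothetical system under test; replace with your actual pipeline."""
    def query(self, question: str) -> dict:
        # ... run retrieval and generation here ...
        return {
            "answer": "Python is a high-level programming language.",
            "retrieved_docs": [Document(page_content="doc_python_intro")],
        }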
Building an Evaluation Dataset
import json

def create_evaluation_dataset():
    """Create an evaluation dataset."""
    dataset = [
        {
            "question": "What is Python?",
            "expected_answer": "Python is a high-level programming language known for its concise, readable syntax.",
            "relevant_docs": ["doc_python_intro", "doc_python_features"]
        },
        {
            "question": "What are the main types of machine learning?",
            "expected_answer": "Machine learning is mainly divided into supervised learning, unsupervised learning, and reinforcement learning.",
            "relevant_docs": ["doc_ml_types", "doc_ml_intro"]
        },
        # ... more test cases
    ]
    return dataset

def save_dataset(dataset, filepath):
    """Save the dataset to a JSON file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

def load_dataset(filepath):
    """Load a dataset from a JSON file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
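Putting the pieces together, a minimal sketch of an evaluation run, assuming rag_system is the pipeline under test (for example an instance like MyRAGSystem above):

# Build the dataset once, then reload it for each evaluation run
save_dataset(create_evaluation_dataset(), "eval_dataset.json")
test_cases = [TestCase(**item) for item in load_dataset("eval_dataset.json")]

rag_evaluator = RAGEvaluator(rag_system)  # rag_system: your RAG pipeline
evaluation_results = rag_evaluator.evaluate_batch(test_cases)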
Evaluation Report
def generate_evaluation_report(evaluation_results: Dict) -> str:
    """Generate a human-readable evaluation report."""
    agg = evaluation_results["aggregate_metrics"]

    report = f"""
========================================
RAG System Evaluation Report
========================================

Retrieval Performance
---------------------
- Precision@3: {agg['retrieval']['precision@3']:.3f}
- Recall@3: {agg['retrieval']['recall@3']:.3f}
- MRR: {agg['retrieval']['mrr']:.3f}

Generation Performance
----------------------
- F1 Score: {agg['generation']['f1']:.3f}
- BLEU Score: {agg['generation']['bleu']:.3f}

Latency
-------
- Mean latency: {agg['latency']['mean']:.2f}s
- P95 latency: {agg['latency']['p95']:.2f}s

Number of test cases: {len(evaluation_results['individual_results'])}
========================================
"""
    return report
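With the evaluation_results dict produced by evaluate_batch above:

report = generate_evaluation_report(evaluation_results)
print(report)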
Key Concepts Summary
| Metric | Meaning | Use case |
|---|---|---|
| Precision@K | Fraction of the top-K results that are relevant | Retrieval precision |
| Recall@K | Fraction of all relevant documents that were found | Retrieval recall |
| MRR | Reciprocal rank of the first relevant result | Ranking quality |
| F1 Score | Harmonic mean of word-overlap precision and recall | Generation quality |
| Faithfulness | Whether the answer is grounded in the context | Hallucination detection |
Next Steps
In the next tutorial, we will cover advanced RAG techniques.