4.3. Memory Issues
Python's automatic memory management usually works well, but knowing these memory-related pitfalls will help you write more efficient code.
4.3.1. Reference Counting and Reference Cycles
4.3.1.1. Reference Counting Basics
import sys

# Inspect the reference count
a = [1, 2, 3]
print(sys.getrefcount(a))  # 2 (the name a + the temporary argument to getrefcount)

b = a                      # add a reference
print(sys.getrefcount(a))  # 3

del b                      # drop a reference
print(sys.getrefcount(a))  # 2
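Containers count too: putting the object into a dict, list, or instance attribute adds a reference exactly like binding a name does. Continuing the example:

container = {'key': a}     # the dict entry is one more reference to the list
print(sys.getrefcount(a))  # 3: the name a, the dict value, and the call's argument

container.clear()
print(sys.getrefcount(a))  # back to 2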
4.3.1.2. Reference Cycles
import gc

class Node:
    def __init__(self, value):
        self.value = value
        self.neighbor = None

# Create a reference cycle
a = Node(1)
b = Node(2)
a.neighbor = b
b.neighbor = a  # reference cycle!

# Even after the names are deleted, the objects are not reclaimed
# immediately: each Node still holds a reference to the other
del a
del b
# The memory stays allocated until the cyclic GC runs

# Trigger a collection manually
gc.collect()
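You can watch the collector actually reclaim the pair by probing one of the Nodes through a weak reference. A minimal sketch, reusing the Node class above:

import gc
import weakref

a = Node(1)
b = Node(2)
a.neighbor, b.neighbor = b, a

probe = weakref.ref(a)  # observe the object without keeping it alive
del a, b
gc.collect()            # the cyclic GC detects and breaks the cycle
print(probe())          # None: both Nodes have been reclaimed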
# ✅ Use a weak reference to break the cycle
import weakref

class Node:
    def __init__(self, value):
        self.value = value
        self._neighbor = None

    @property
    def neighbor(self):
        return self._neighbor() if self._neighbor else None

    @neighbor.setter
    def neighbor(self, node):
        self._neighbor = weakref.ref(node) if node else None
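The trade-off: a weak reference does not keep its target alive, so a neighbor silently becomes None once the last strong reference to it disappears. Something else must own each Node:

a = Node(1)
b = Node(2)
a.neighbor = b
b.neighbor = a         # no strong cycle: both die by refcounting alone

a.neighbor = Node(3)   # ⚠️ nothing else references Node(3)...
print(a.neighbor)      # None (in CPython it is reclaimed immediately)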
4.3.1.3. Detecting Memory Leaks
import tracemalloc

# Start tracing memory allocations
tracemalloc.start()

# Your code
data = [list(range(10000)) for _ in range(100)]

# Take a memory snapshot
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')
print("Top 10 memory allocations:")
for stat in top_stats[:10]:
    print(stat)

# Compare two snapshots
snapshot1 = tracemalloc.take_snapshot()
# ... do some work ...
snapshot2 = tracemalloc.take_snapshot()
top_stats = snapshot2.compare_to(snapshot1, 'lineno')
print("Memory differences:")
for stat in top_stats[:10]:
    print(stat)
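Snapshots often contain allocation noise from the import machinery; tracemalloc's filter_traces can drop it before you print the statistics:

snapshot = snapshot.filter_traces((
    tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
    tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics('lineno')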
4.3.2. Handling Large Objects
4.3.2.1. Generators Instead of Lists
import sys

def process(i):
    return i * 2  # placeholder for some per-item transformation

# ❌ A list keeps every element in memory
def get_data_list():
    return [process(i) for i in range(1000000)]

data = get_data_list()
print(sys.getsizeof(data))  # ~8 MB (the list of pointers alone; elements add more)

# ✅ A generator uses almost no memory
def get_data_generator():
    for i in range(1000000):
        yield process(i)

gen = get_data_generator()
print(sys.getsizeof(gen))  # ~200 bytes
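Generators also compose into lazy pipelines: each stage pulls one item at a time, so nothing is materialized until something consumes the stream:

# Filter and aggregate without ever building an intermediate list
evens = (x for x in get_data_generator() if x % 2 == 0)
total = sum(evens)  # constant memory, regardless of input size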
4.3.2.2. Processing Large Files in Chunks
# ❌ Read everything at once
with open('huge_file.txt') as f:
    data = f.read()  # may exhaust memory

# ✅ Read in chunks
def read_in_chunks(file_path, chunk_size=1024*1024):
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk

for chunk in read_in_chunks('huge_file.bin'):
    process(chunk)

# ✅ Read line by line
with open('huge_file.txt') as f:
    for line in f:  # the file object is an iterator; memory-friendly
        process(line)
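The chunked loop can also be written with the two-argument form of iter(), which calls a function repeatedly until it returns a sentinel value:

# Equivalent chunked read using the iter(callable, sentinel) idiom
with open('huge_file.bin', 'rb') as f:
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
        process(chunk)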
4.3.2.3. numpy Memory Mapping
import numpy as np

# ❌ Load the whole array into memory
data = np.load('huge_array.npy')  # may exhaust memory

# ✅ Use a memory map
data = np.load('huge_array.npy', mmap_mode='r')
# Pages are loaded lazily, only when accessed
print(data[0:100])  # loads only the first 100 elements
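The same laziness lets you aggregate over an array much larger than RAM by walking it in windows (a sketch assuming a 1-D array; the window size is arbitrary):

total = 0.0
for start in range(0, data.shape[0], 100_000):
    total += data[start:start + 100_000].sum()  # one window resident at a time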
4.3.3. __slots__ Optimization
import sys

class PointWithDict:
    def __init__(self, x, y):
        self.x = x
        self.y = y

class PointWithSlots:
    __slots__ = ['x', 'y']

    def __init__(self, x, y):
        self.x = x
        self.y = y

# Memory comparison
p1 = PointWithDict(1, 2)
p2 = PointWithSlots(1, 2)
print(sys.getsizeof(p1) + sys.getsizeof(p1.__dict__))  # ~152 bytes
print(sys.getsizeof(p2))                               # ~56 bytes

# The difference becomes significant with many objects
points_dict = [PointWithDict(i, i) for i in range(100000)]
points_slots = [PointWithSlots(i, i) for i in range(100000)]
# The __slots__ version saves roughly 50% of the memory
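The saving comes from dropping the per-instance __dict__, which also means you can no longer attach arbitrary attributes:

p = PointWithSlots(1, 2)
try:
    p.z = 3  # not listed in __slots__, and there is no __dict__ to fall back on
except AttributeError as e:
    print(e)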
4.3.4. String Interning
import sys

# Python automatically interns some strings
a = "hello"
b = "hello"
print(a is b)  # True (interned)

# But not all strings
a = "hello world"
b = "hello world"
print(a is b)  # may be False (implementation-dependent)

# Intern manually
a = sys.intern("hello world")
b = sys.intern("hello world")
print(a is b)  # True

# Useful when the same strings repeat many times,
# e.g. log fields or configuration keys
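For example, when parsing logs where the same keys recur millions of times, interning the keys makes every occurrence share one string object. A sketch (the key=value format and parse_pairs are hypothetical; sys.intern is the real API):

import sys

def parse_pairs(lines):
    rows = []
    for line in lines:
        key, _, value = line.partition('=')
        rows.append((sys.intern(key), value))  # duplicate keys share storage
    return rows

rows = parse_pairs(["user=alice", "user=bob", "user=carol"])
print(rows[0][0] is rows[1][0])  # True: one 'user' object, not three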
4.3.5. Caching Pitfalls
4.3.5.1. lru_cache Memory Leaks
from functools import lru_cache

def expensive_operation(data, key):  # placeholder for real work
    ...

class DataProcessor:
    def __init__(self, data):
        self.data = data

    @lru_cache(maxsize=128)
    def process(self, key):
        # ⚠️ Problem: self is part of the cache key, so the cache
        # (which lives on the class-level function) holds a reference
        # to every instance that ever called process
        return expensive_operation(self.data, key)

# Every instance stays referenced by the cache,
# so instances are never reclaimed even when no longer used

# ✅ Solution 1: use a static method (or class method) so self is not cached
class DataProcessor:
    @staticmethod
    @lru_cache(maxsize=128)
    def process(data_tuple, key):  # arguments must be hashable, hence a tuple
        return expensive_operation(data_tuple, key)

# ✅ Solution 2: build a per-instance cache in __init__, so the cache
# dies with the instance. (Note: clearing the cache in __del__ does not
# work: the shared cache keeps the instance alive, so __del__ never runs.)
class DataProcessor:
    def __init__(self, data):
        self.data = data
        self.process = lru_cache(maxsize=128)(self._process)

    def _process(self, key):
        return expensive_operation(self.data, key)
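With the per-instance variant you can confirm the behavior through cache_info(), and the cache becomes garbage together with its instance (the small instance-to-bound-method cycle is handled by the cyclic GC):

dp = DataProcessor((1, 2, 3))
dp.process('k')
dp.process('k')                 # second call is served from the cache
print(dp.process.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
del dp                          # instance and its cache go away together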
4.3.5.2. Unbounded Global Caches
# ❌ A cache that grows without bound
cache = {}

def get_data(key):
    if key not in cache:
        cache[key] = expensive_operation(key)
    return cache[key]
# The cache grows forever and eventually exhausts memory

# ✅ Use an LRU cache
from functools import lru_cache

@lru_cache(maxsize=1000)  # bounded size
def get_data(key):
    return expensive_operation(key)

# ✅ Use a cache with expiry
from cachetools import TTLCache
cache = TTLCache(maxsize=1000, ttl=300)  # entries expire after 5 minutes
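To actually route calls through the TTLCache, cachetools also provides a cached decorator; a minimal sketch:

from cachetools import TTLCache, cached

@cached(TTLCache(maxsize=1000, ttl=300))
def get_data(key):
    return expensive_operation(key)  # recomputed once the entry expires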
4.3.6. Memory Profiling Tools
4.3.6.1. memory_profiler
# pip install memory_profiler
from memory_profiler import profile

@profile
def memory_intensive():
    a = [1] * 1000000
    b = [2] * 2000000
    del b
    return a

memory_intensive()
# Run with: python -m memory_profiler script.py
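memory_profiler also ships the mprof command line tool, which samples memory usage over time: "mprof run script.py" records a profile and "mprof plot" graphs it.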
4.3.6.2. objgraph
# pip install objgraph
import objgraph

# Show the most common object types
objgraph.show_most_common_types()

# Show which types have grown since the last call
objgraph.show_growth()

# Draw the reference graph starting from obj (useful for spotting cycles)
objgraph.show_refs([obj], filename='refs.png')

# Draw the objects that refer to obj
objgraph.show_backrefs([obj], filename='backrefs.png')
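A typical leak hunt: list the live instances of a suspect class with by_type, then draw the back-references of one of them (a sketch; 'Node' refers to the class from 4.3.1.2):

import random

leaked = objgraph.by_type('Node')  # all live Node instances
if leaked:
    objgraph.show_backrefs([random.choice(leaked)],
                           max_depth=5, filename='node_backrefs.png')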
4.3.6.3. tracemalloc
import tracemalloc

tracemalloc.start()

# Your code
data = [list(range(1000)) for _ in range(1000)]

current, peak = tracemalloc.get_traced_memory()
print(f"Current: {current / 1024 / 1024:.2f} MB")
print(f"Peak: {peak / 1024 / 1024:.2f} MB")

tracemalloc.stop()
4.3.7. Best Practices
Memory optimization principles:
- Use generators: for processing large data sets
- Process in chunks: large files, large query results
- Release promptly: set large objects you no longer need to None, or del them
- Use __slots__: when creating large numbers of small objects
- Bound your caches: set maxsize on lru_cache
- Avoid reference cycles: use weak references
Debugging tips:
# Check an object's size
import sys
print(sys.getsizeof(obj))

# Check its reference count
print(sys.getrefcount(obj))

# Trigger GC manually
import gc
gc.collect()

# Count the objects tracked by the GC
print(len(gc.get_objects()))