# Memory Issues

Python's automatic memory management usually works well, but knowing about these memory-related pitfalls helps you write more efficient code.

## Reference Counting and Reference Cycles

### Reference counting basics

```python
import sys

# Inspect an object's reference count
a = [1, 2, 3]
print(sys.getrefcount(a))  # 2 (a itself + the argument to getrefcount)

b = a  # adds a reference
print(sys.getrefcount(a))  # 3

del b  # removes a reference
print(sys.getrefcount(a))  # 2
```

### Reference cycles

```python
import gc

class Node:
    def __init__(self, value):
        self.value = value
        self.neighbor = None

# Create a reference cycle
a = Node(1)
b = Node(2)
a.neighbor = b
b.neighbor = a  # reference cycle!

# Even after deleting the variables, the objects are not reclaimed immediately
del a
del b
# The memory is still held until the cyclic GC runs

# Trigger the GC manually
gc.collect()

# ✅ Break the cycle with weak references
import weakref

class Node:
    def __init__(self, value):
        self.value = value
        self._neighbor = None

    @property
    def neighbor(self):
        return self._neighbor() if self._neighbor else None

    @neighbor.setter
    def neighbor(self, node):
        self._neighbor = weakref.ref(node) if node else None
```

### Detecting memory leaks

```python
import tracemalloc

# Start tracing memory allocations
tracemalloc.start()

# Your code
data = [list(range(10000)) for _ in range(100)]

# Take a memory snapshot
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')

print("Top 10 memory allocations:")
for stat in top_stats[:10]:
    print(stat)

# Compare two snapshots
snapshot1 = tracemalloc.take_snapshot()
# ... do some work ...
snapshot2 = tracemalloc.take_snapshot()

top_stats = snapshot2.compare_to(snapshot1, 'lineno')
print("Memory differences:")
for stat in top_stats[:10]:
    print(stat)
```

## Handling Large Objects

### Generators instead of lists

```python
import sys

def process(i):  # stand-in for real per-item work
    return i * 2

# ❌ A list keeps every element in memory
def get_data_list():
    return [process(i) for i in range(1000000)]

data = get_data_list()
print(sys.getsizeof(data))  # ~8 MB for the list object alone (pointers only)

# ✅ A generator uses almost no memory
def get_data_generator():
    for i in range(1000000):
        yield process(i)

gen = get_data_generator()
print(sys.getsizeof(gen))  # ~200 bytes
```

### Processing large files in chunks

```python
# ❌ Read everything at once
with open('huge_file.txt') as f:
    data = f.read()  # may exhaust memory

# ✅ Read in chunks
def read_in_chunks(file_path, chunk_size=1024*1024):
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk

for chunk in read_in_chunks('huge_file.bin'):
    process(chunk)  # process() stands in for your own handling code

# ✅ Read line by line
with open('huge_file.txt') as f:
    for line in f:  # file objects are iterators; memory friendly
        process(line)
```

### numpy memory mapping

```python
import numpy as np

# ❌ Load the whole array into memory
data = np.load('huge_array.npy')  # may exhaust memory

# ✅ Use a memory map
data = np.load('huge_array.npy', mmap_mode='r')
# Only the parts you access are loaded
print(data[0:100])  # loads only the first 100 elements
```

## `__slots__` Optimization

```python
import sys

class PointWithDict:
    def __init__(self, x, y):
        self.x = x
        self.y = y

class PointWithSlots:
    __slots__ = ['x', 'y']

    def __init__(self, x, y):
        self.x = x
        self.y = y

# Compare memory usage
p1 = PointWithDict(1, 2)
p2 = PointWithSlots(1, 2)

print(sys.getsizeof(p1) + sys.getsizeof(p1.__dict__))  # ~152 bytes
print(sys.getsizeof(p2))  # ~56 bytes

# The difference matters when creating many objects
points_dict = [PointWithDict(i, i) for i in range(100000)]
points_slots = [PointWithSlots(i, i) for i in range(100000)]
# The __slots__ version saves roughly half the per-instance memory
```

## String Interning

```python
import sys

# Python automatically interns some strings
a = "hello"
b = "hello"
print(a is b)  # True (interned)

# But not all strings
a = "hello world"
b = "hello world"
print(a is b)  # may be False (e.g., when entered as separate REPL statements)

# Intern manually
a = sys.intern("hello world")
b = sys.intern("hello world")
print(a is b)  # True

# Useful when handling many duplicate strings,
# e.g., log fields or configuration keys
```

## Caching Caveats

### lru_cache memory leaks

```python
from functools import lru_cache

class DataProcessor:
    def __init__(self, data):
        self.data = data

    @lru_cache(maxsize=128)
    def process(self, key):
        # ⚠️ Problem: self is part of the cache key, so the cache
        # holds a reference to every instance
        return expensive_operation(self.data, key)

# Each instance is kept alive by the cache and is never
# reclaimed, even when it is no longer used

# ✅ Solution 1: use a static method (or a module-level function)
class DataProcessor:
    @staticmethod
    @lru_cache(maxsize=128)
    def process(data_tuple, key):
        return expensive_operation(data_tuple, key)

# ✅ Solution 2: clear the cache explicitly when you are done with an
# instance. (Relying on __del__ for this does not work: the cache keeps
# the instance alive, so __del__ would never run. Note also that
# cache_clear() empties the shared cache for ALL instances.)
class DataProcessor:
    def __init__(self, data):
        self.data = data

    @lru_cache(maxsize=128)
    def process(self, key):
        return expensive_operation(self.data, key)

    def close(self):
        self.process.cache_clear()
```
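As a third option, each instance can own its cache: wrapping the bound method with `lru_cache` in `__init__` ties the cache's lifetime to the instance. A minimal sketch (`expensive_operation` is a stand-in, mirroring the placeholder above):

```python
from functools import lru_cache

def expensive_operation(data, key):  # stand-in for real work
    return sum(data) + key

class DataProcessor:
    def __init__(self, data):
        self.data = data
        # The wrapper (and its cache) lives on the instance, so cached
        # entries are released when the instance is garbage collected.
        # (This creates a self -> wrapper -> self cycle, which the
        # cyclic GC can still collect.)
        self.process = lru_cache(maxsize=128)(self._process)

    def _process(self, key):
        return expensive_operation(self.data, key)

p = DataProcessor([1, 2, 3])
print(p.process(10))  # 16, computed
print(p.process(10))  # 16, served from this instance's cache
```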
### Global cache growth

```python
# ❌ A cache that grows without bound
cache = {}

def get_data(key):
    if key not in cache:
        cache[key] = expensive_operation(key)
    return cache[key]

# The cache grows forever and eventually exhausts memory

# ✅ Use an LRU cache
from functools import lru_cache

@lru_cache(maxsize=1000)  # bounded size
def get_data(key):
    return expensive_operation(key)

# ✅ Use a cache with expiry
from cachetools import TTLCache, cached

cache = TTLCache(maxsize=1000, ttl=300)  # entries expire after 5 minutes

@cached(cache)
def get_data(key):
    return expensive_operation(key)
```

## Memory Profiling Tools

### memory_profiler

```python
# pip install memory_profiler
from memory_profiler import profile

@profile
def memory_intensive():
    a = [1] * 1000000
    b = [2] * 2000000
    del b
    return a

memory_intensive()
# Run with: python -m memory_profiler script.py
```

### objgraph

```python
# pip install objgraph
import objgraph

# Show the most common object types
objgraph.show_most_common_types()

# Show how object counts have grown since the last call
objgraph.show_growth()

# Draw the reference graph of an object (helps spot cycles);
# obj is whatever object you are investigating
objgraph.show_refs([obj], filename='refs.png')

# Draw the objects that hold references to obj
objgraph.show_backrefs([obj], filename='backrefs.png')
```

### tracemalloc

```python
import tracemalloc

tracemalloc.start()

# Your code
data = [list(range(1000)) for _ in range(1000)]

current, peak = tracemalloc.get_traced_memory()
print(f"Current: {current / 1024 / 1024:.2f} MB")
print(f"Peak: {peak / 1024 / 1024:.2f} MB")

tracemalloc.stop()
```

## Best Practices

::::{grid} 1
:gutter: 2

:::{grid-item-card} Memory optimization principles
1. **Use generators** when processing large data sets
2. **Process in chunks**: large files, large database result sets
3. **Release early**: `del` large objects you no longer need, or set them to `None` (see the sketch after this section)
4. **Use `__slots__`** when creating many small objects
5. **Bound your caches** with `lru_cache`'s `maxsize`
6. **Avoid reference cycles** by using weak references
:::

:::{grid-item-card} Debugging tips
```python
# Check an object's size
import sys
print(sys.getsizeof(obj))

# Check an object's reference count
print(sys.getrefcount(obj))

# Run the GC manually
import gc
gc.collect()

# Count the objects tracked by the GC
print(len(gc.get_objects()))
```
:::

::::
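As a quick illustration of principle 3 ("release early"), a minimal sketch using only the standard library: dropping a large intermediate with `del` lowers traced memory before the function even returns.

```python
import tracemalloc

tracemalloc.start()

def build_report():
    raw = [list(range(1000)) for _ in range(1000)]  # large intermediate
    summary = [len(row) for row in raw]
    # ✅ Release the large intermediate as soon as it is no longer
    # needed, instead of keeping it alive until the function returns
    del raw
    current, _ = tracemalloc.get_traced_memory()
    print(f"After del: {current / 1024 / 1024:.2f} MB")
    return summary

build_report()
tracemalloc.stop()
```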