{"id":1745,"date":"2025-01-04T23:08:45","date_gmt":"2025-01-04T15:08:45","guid":{"rendered":"https:\/\/www.fanyamin.com\/wordpress\/?p=1745"},"modified":"2025-01-04T23:08:45","modified_gmt":"2025-01-04T15:08:45","slug":"%e6%96%87%e6%9c%ac%e5%88%86%e5%89%b2%e7%9a%84%e6%96%b9%e6%b3%95","status":"publish","type":"post","link":"https:\/\/www.fanyamin.com\/wordpress\/?p=1745","title":{"rendered":"\u6587\u672c\u5206\u5272\u7684\u65b9\u6cd5"},"content":{"rendered":"<p>\u957f\u6587\u672c\u5206\u5272\u662f\u6784\u5efa\u9ad8\u6548\u68c0\u7d22\u7cfb\u7edf\u7684\u91cd\u8981\u6b65\u9aa4\uff0c\u597d\u7684\u5206\u5272\u65b9\u6cd5\u9700\u8981\u517c\u987e\u8bed\u4e49\u5b8c\u6574\u6027\u548c\u5757\u5927\u5c0f\u9002\u4e2d\u3002\u6211\u4eec\u8981\u907f\u514d\u7b80\u5355\u7684\u56fa\u5b9a\u957f\u5ea6\u5207\u5206\u5bfc\u81f4\u8bed\u4e49\u4e22\u5931\u7684\u95ee\u9898\uff1a<\/p>\n<hr \/>\n<h3>1. <strong>\u57fa\u4e8e\u53e5\u5b50\u5206\u5272<\/strong><\/h3>\n<h4>\u65b9\u6cd5<\/h4>\n<p>\u4f7f\u7528\u81ea\u7136\u8bed\u8a00\u5904\u7406\u5de5\u5177\u5c06\u6587\u672c\u5206\u5272\u6210\u53e5\u5b50\uff0c\u7136\u540e\u518d\u7ec4\u5408\u6210\u9002\u5f53\u5927\u5c0f\u7684\u5757\u3002<\/p>\n<h4>\u5b9e\u73b0\u793a\u4f8b<\/h4>\n<pre><code class=\"language-python\">import nltk\nfrom nltk.tokenize import sent_tokenize\n\n# \u4e0b\u8f7d punkt \u5206\u8bcd\u5668\uff08\u9996\u6b21\u8fd0\u884c\u9700\u8981\uff09\nnltk.download(&#039;punkt&#039;)\n\ndef split_text_by_sentences(text, chunk_size=300):\n    sentences = sent_tokenize(text)\n    chunks = []\n    current_chunk = &quot;&quot;\n\n    for sentence in sentences:\n        # \u5982\u679c\u5f53\u524d\u5757\u52a0\u4e0a\u65b0\u53e5\u5b50\u7684\u957f\u5ea6\u8d85\u8fc7 chunk_size\uff0c\u5219\u5f00\u59cb\u65b0\u5757\n        if len(current_chunk) + len(sentence) &gt; chunk_size:\n            chunks.append(current_chunk.strip())\n            current_chunk = sentence\n        else:\n            current_chunk += &quot; &quot; + sentence\n\n    # \u6dfb\u52a0\u6700\u540e\u4e00\u4e2a\u5757\n    if current_chunk:\n        chunks.append(current_chunk.strip())\n\n    return chunks\n\n# \u793a\u4f8b\u4f7f\u7528\ntext_chunks = split_text_by_sentences(pdf_text, chunk_size=300)\nprint(f&quot;\u5206\u5272\u540e\u7684\u5757\u6570: {len(text_chunks)}&quot;)\nprint(text_chunks[:3])  # \u6253\u5370\u524d 3 \u4e2a\u5757<\/code><\/pre>\n<hr \/>\n<h3>2. <strong>\u57fa\u4e8e\u6bb5\u843d\u5206\u5272<\/strong><\/h3>\n<h4>\u65b9\u6cd5<\/h4>\n<p>\u5982\u679c\u6587\u672c\u683c\u5f0f\u5305\u542b\u6bb5\u843d\u4fe1\u606f\uff08\u5982 PDF \u63d0\u53d6\u7ed3\u679c\uff09\uff0c\u53ef\u4ee5\u4f18\u5148\u6309\u7167\u6bb5\u843d\u5206\u5272\uff0c\u786e\u4fdd\u5757\u7684\u8bed\u4e49\u5b8c\u6574\u6027\u3002<\/p>\n<h4>\u5b9e\u73b0\u793a\u4f8b<\/h4>\n<pre><code class=\"language-python\">def split_text_by_paragraphs(text, chunk_size=300):\n    paragraphs = text.split(&quot;\\n\\n&quot;)  # \u5047\u8bbe\u6bb5\u843d\u4ee5\u53cc\u6362\u884c\u5206\u9694\n    chunks = []\n    current_chunk = &quot;&quot;\n\n    for paragraph in paragraphs:\n        if len(current_chunk) + len(paragraph) &gt; chunk_size:\n            chunks.append(current_chunk.strip())\n            current_chunk = paragraph\n        else:\n            current_chunk += &quot; &quot; + paragraph\n\n    if current_chunk:\n        chunks.append(current_chunk.strip())\n\n    return chunks\n\n# \u793a\u4f8b\u4f7f\u7528\ntext_chunks = split_text_by_paragraphs(pdf_text, chunk_size=300)\nprint(f&quot;\u5206\u5272\u540e\u7684\u5757\u6570: {len(text_chunks)}&quot;)\nprint(text_chunks[:3])  # \u6253\u5370\u524d 3 \u4e2a\u5757<\/code><\/pre>\n<hr \/>\n<h3>3. <strong>\u6ed1\u52a8\u7a97\u53e3\u6cd5<\/strong><\/h3>\n<h4>\u65b9\u6cd5<\/h4>\n<p>\u4f7f\u7528\u6ed1\u52a8\u7a97\u53e3\u6280\u672f\u521b\u5efa\u6709\u91cd\u53e0\u7684\u5757\uff0c\u4fdd\u8bc1\u67e5\u8be2\u65f6\u7684\u4e0a\u4e0b\u6587\u4fe1\u606f\u5b8c\u6574\u6027\u3002<\/p>\n<h4>\u5b9e\u73b0\u793a\u4f8b<\/h4>\n<pre><code class=\"language-python\">def sliding_window_split(text, chunk_size=300, overlap=50):\n    chunks = []\n    for i in range(0, len(text), chunk_size - overlap):\n        chunk = text[i:i + chunk_size]\n        chunks.append(chunk.strip())\n    return chunks\n\n# \u793a\u4f8b\u4f7f\u7528\ntext_chunks = sliding_window_split(pdf_text, chunk_size=300, overlap=50)\nprint(f&quot;\u5206\u5272\u540e\u7684\u5757\u6570: {len(text_chunks)}&quot;)\nprint(text_chunks[:3])  # \u6253\u5370\u524d 3 \u4e2a\u5757<\/code><\/pre>\n<hr \/>\n<h3>4. <strong>\u57fa\u4e8e\u6587\u672c\u5206\u5c42\u5206\u5272<\/strong><\/h3>\n<h4>\u65b9\u6cd5<\/h4>\n<p>\u5982\u679c\u6587\u672c\u683c\u5f0f\u590d\u6742\uff08\u5982\u6280\u672f\u6587\u6863\u3001\u8bba\u6587\uff09\uff0c\u53ef\u4ee5\u7ed3\u5408\u6807\u9898\u3001\u7ae0\u8282\u7b49\u7ed3\u6784\u4fe1\u606f\uff0c\u4f18\u5148\u6309\u7ed3\u6784\u5206\u5272\u3002<\/p>\n<h4>\u793a\u4f8b<\/h4>\n<pre><code class=\"language-python\">def split_text_by_structure(text, chunk_size=300):\n    # \u5047\u8bbe\u6807\u9898\u4ee5 &quot;## &quot; \u5f00\u5934\n    sections = text.split(&quot;## &quot;)\n    chunks = []\n\n    for section in sections:\n        if len(section) &gt; chunk_size:\n            chunks.extend(sliding_window_split(section, chunk_size=chunk_size, overlap=50))\n        else:\n            chunks.append(section.strip())\n\n    return chunks\n\n# \u793a\u4f8b\u4f7f\u7528\nstructured_text = &quot;## Introduction\\nThis is the intro.\\n## Methodology\\nThis is the method section.&quot;\ntext_chunks = split_text_by_structure(structured_text, chunk_size=100)\nprint(f&quot;\u5206\u5272\u540e\u7684\u5757\u6570: {len(text_chunks)}&quot;)\nprint(text_chunks)  # \u6253\u5370\u5206\u5272\u7ed3\u679c<\/code><\/pre>\n<hr \/>\n<h3>5. <strong>\u7ed3\u5408 NLP \u5de5\u5177\u7684\u667a\u80fd\u5206\u5272<\/strong><\/h3>\n<h4>\u65b9\u6cd5<\/h4>\n<p>\u4f7f\u7528 NLP \u5de5\u5177\uff08\u5982 spaCy\uff09\u6309\u8bed\u4e49\u5206\u5272\u6587\u672c\uff0c\u4f8b\u5982\u6309\u53e5\u5b50\u6216\u8bed\u4e49\u6bb5\u843d\u8fdb\u884c\u5206\u5272\u3002<\/p>\n<h4>\u5b9e\u73b0\u793a\u4f8b<\/h4>\n<pre><code class=\"language-python\">import spacy\n\n# \u52a0\u8f7d spaCy \u6a21\u578b\nnlp = spacy.load(&quot;en_core_web_sm&quot;)\n\ndef split_text_by_semantics(text, chunk_size=300):\n    doc = nlp(text)\n    chunks = []\n    current_chunk = &quot;&quot;\n\n    for sentence in doc.sents:\n        if len(current_chunk) + len(sentence.text) &gt; chunk_size:\n            chunks.append(current_chunk.strip())\n            current_chunk = sentence.text\n        else:\n            current_chunk += &quot; &quot; + sentence.text\n\n    if current_chunk:\n        chunks.append(current_chunk.strip())\n\n    return chunks\n\n# \u793a\u4f8b\u4f7f\u7528\ntext_chunks = split_text_by_semantics(pdf_text, chunk_size=300)\nprint(f&quot;\u5206\u5272\u540e\u7684\u5757\u6570: {len(text_chunks)}&quot;)\nprint(text_chunks[:3])  # \u6253\u5370\u524d 3 \u4e2a\u5757<\/code><\/pre>\n<hr \/>\n<h3>\u6bd4\u8f83\u4e0e\u9009\u62e9<\/h3>\n<table>\n<thead>\n<tr>\n<th>\u65b9\u6cd5<\/th>\n<th>\u4f18\u70b9<\/th>\n<th>\u7f3a\u70b9<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><strong>\u53e5\u5b50\u5206\u5272<\/strong><\/td>\n<td>\u7b80\u5355\u76f4\u89c2\uff0c\u8bed\u4e49\u5b8c\u6574<\/td>\n<td>\u5bf9\u957f\u53e5\u5b50\u53ef\u80fd\u4e0d\u591f\u7075\u6d3b<\/td>\n<\/tr>\n<tr>\n<td><strong>\u6bb5\u843d\u5206\u5272<\/strong><\/td>\n<td>\u4fdd\u7559\u6bb5\u843d\u8bed\u4e49<\/td>\n<td>\u6bb5\u843d\u8fc7\u957f\u65f6\u9700\u8fdb\u4e00\u6b65\u5206\u5272<\/td>\n<\/tr>\n<tr>\n<td><strong>\u6ed1\u52a8\u7a97\u53e3\u6cd5<\/strong><\/td>\n<td>\u4fdd\u8bc1\u4e0a\u4e0b\u6587\u4fe1\u606f\u5b8c\u6574<\/td>\n<td>\u5b58\u5728\u91cd\u590d\u4fe1\u606f\uff0c\u53ef\u80fd\u589e\u52a0\u5b58\u50a8\u6210\u672c<\/td>\n<\/tr>\n<tr>\n<td><strong>\u6587\u672c\u5206\u5c42\u5206\u5272<\/strong><\/td>\n<td>\u9002\u5408\u7ed3\u6784\u5316\u6587\u672c<\/td>\n<td>\u4f9d\u8d56\u6587\u672c\u4e2d\u6709\u660e\u663e\u7ed3\u6784\u6807\u8bb0<\/td>\n<\/tr>\n<tr>\n<td><strong>NLP \u667a\u80fd\u5206\u5272<\/strong><\/td>\n<td>\u7ed3\u5408\u8bed\u4e49\u5206\u6790\uff0c\u6548\u679c\u66f4\u667a\u80fd<\/td>\n<td>\u9700\u8981\u52a0\u8f7d NLP \u6a21\u578b\uff0c\u901f\u5ea6\u8f83\u6162<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<hr \/>\n<h3>\u6700\u4f73\u5b9e\u8df5<\/h3>\n<ol>\n<li>\u5982\u679c\u6587\u672c\u8bed\u4e49\u7ed3\u6784\u8f83\u597d\uff08\u5982\u6280\u672f\u6587\u6863\u3001\u8bba\u6587\uff09\uff0c<strong>\u4f18\u5148\u57fa\u4e8e\u6bb5\u843d\u6216\u7ed3\u6784\u5206\u5272<\/strong>\u3002<\/li>\n<li>\u5982\u679c\u9700\u8981\u7cbe\u786e\u68c0\u7d22\u4e0a\u4e0b\u6587\uff0c<strong>\u6ed1\u52a8\u7a97\u53e3\u6cd5<\/strong> \u662f\u5f88\u597d\u7684\u9009\u62e9\u3002<\/li>\n<li>\u5bf9\u901a\u7528\u573a\u666f\uff0c<strong>\u7ed3\u5408 NLP \u5de5\u5177\u7684\u667a\u80fd\u5206\u5272<\/strong> \u662f\u6700\u5f3a\u5927\u7684\u65b9\u6cd5\u3002<\/li>\n<\/ol>\n<p>\u5efa\u8bae\u6839\u636e\u5177\u4f53\u7684\u6587\u672c\u5185\u5bb9\u9009\u62e9\u5408\u9002\u7684\u5206\u5272\u65b9\u6cd5\uff0c\u540c\u65f6\u53ef\u4ee5\u7ed3\u5408\u5206\u5272\u65b9\u6cd5\u4f18\u5316 RAG \u7cfb\u7edf\u7684\u6027\u80fd\uff01<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u957f\u6587\u672c\u5206\u5272\u662f\u6784\u5efa\u9ad8\u6548\u68c0\u7d22\u7cfb\u7edf\u7684\u91cd\u8981\u6b65\u9aa4\uff0c\u597d\u7684\u5206\u5272\u65b9\u6cd5\u9700\u8981\u517c\u987e\u8bed\u4e49\u5b8c\u6574\u6027\u548c\u5757\u5927\u5c0f\u9002\u4e2d\u3002\u6211\u4eec\u8981\u907f\u514d\u7b80\u5355\u7684\u56fa\u5b9a\u957f\u5ea6\u5207\u5206\u5bfc\u81f4\u8bed\u4e49\u4e22\u5931\u7684\u95ee\u9898\uff1a 1. \u57fa\u4e8e\u53e5\u5b50\u5206\u5272 \u65b9\u6cd5 \u4f7f\u7528\u81ea\u7136\u8bed\u8a00\u5904\u7406\u5de5\u5177\u5c06\u6587\u672c\u5206\u5272\u6210\u53e5\u5b50\uff0c\u7136\u540e\u518d\u7ec4\u5408\u6210\u9002\u5f53\u5927\u5c0f\u7684\u5757\u3002 \u5b9e\u73b0\u793a\u4f8b import nltk from nltk.tokenize import sent_tokenize # \u4e0b\u8f7d punkt \u5206\u8bcd\u5668\uff08\u9996\u6b21\u8fd0\u884c\u9700\u8981\uff09 nltk.download(&#039;punkt&#039;) def split_text_by_sentences(text, chunk_size=300): sentences = sent_tokenize(text) chunks = [] current_chunk = &quot;&quot; for sentence in sentences: # \u5982\u679c\u5f53\u524d\u5757\u52a0\u4e0a\u65b0\u53e5\u5b50\u7684\u957f\u5ea6\u8d85\u8fc7 chunk_size\uff0c\u5219\u5f00\u59cb\u65b0\u5757 if len(current_chunk) + len(sentence) &gt; chunk_size: chunks.append(current_chunk.strip()) current_chunk = sentence else: current_chunk += &quot; &quot; + sentence # \u6dfb\u52a0\u6700\u540e\u4e00\u4e2a\u5757 [&hellip;] <a class=\"read-more\" href=\"https:\/\/www.fanyamin.com\/wordpress\/?p=1745\" title=\"Permanent Link to: \u6587\u672c\u5206\u5272\u7684\u65b9\u6cd5\">&rarr;Read&nbsp;more<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[],"class_list":["post-1745","post","type-post","status-publish","format-standard","hentry","category-5"],"_links":{"self":[{"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=\/wp\/v2\/posts\/1745"}],"collection":[{"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1745"}],"version-history":[{"count":1,"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=\/wp\/v2\/posts\/1745\/revisions"}],"predecessor-version":[{"id":1746,"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=\/wp\/v2\/posts\/1745\/revisions\/1746"}],"wp:attachment":[{"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1745"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1745"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.fanyamin.com\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1745"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}