import{_ as s}from"./plugin-vue_export-helper-DlAUqK2U.js";import{c as n,o as e,a}from"./app-CQ4eKaQa.js";const l={},o=a(`<h2 id="获取重复的文件" tabindex="-1"><a class="header-anchor" href="#获取重复的文件"><span>获取重复的文件</span></a></h2><h2 id="背景和环境说明" tabindex="-1"><a class="header-anchor" href="#背景和环境说明"><span>背景和环境说明</span></a></h2><blockquote><p>本程序使用 <code>python 3.10+</code><br> 机器学习，样本处理过程中会有重复的数据。这里的样本是<code>图片</code>，本程序根据文件<code>MD5</code> 获取重复文件。</p></blockquote><h2 id="依赖包下载" tabindex="-1"><a class="header-anchor" href="#依赖包下载"><span>依赖包下载</span></a></h2><blockquote><p><code>pip install -U NStudyPy</code></p></blockquote><h2 id="使用" tabindex="-1"><a class="header-anchor" href="#使用"><span>使用</span></a></h2><div class="language-python line-numbers-mode" data-highlighter="shiki" data-ext="python" data-title="python" style="background-color:#1E1E1E;color:#D4D4D4;"><pre class="shiki dark-plus vp-code"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> NStudyPy </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> PyFile</span></span>
<span class="line"></span>
<span class="line"><span style="color:#C586C0;">if</span><span style="color:#9CDCFE;"> __name__</span><span style="color:#D4D4D4;"> == </span><span style="color:#CE9178;">&#39;__main__&#39;</span><span style="color:#D4D4D4;">:</span></span>
<span class="line"><span style="color:#D4D4D4;">    PyFile.get_repeat_file(</span><span style="color:#569CD6;">r</span><span style="color:#D16969;">&#39;F:</span><span style="color:#D7BA7D;">\\t</span><span style="color:#D16969;">emp</span><span style="color:#D7BA7D;">\\c</span><span style="color:#D16969;">ards&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">is_recursive</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><blockquote><p><code>is_recursive</code> 参数来限定是否递归子目录</p></blockquote><h2 id="核心源码" tabindex="-1"><a class="header-anchor" href="#核心源码"><span>核心源码</span></a></h2><div class="language-python line-numbers-mode" data-highlighter="shiki" data-ext="python" data-title="python" style="background-color:#1E1E1E;color:#D4D4D4;"><pre class="shiki dark-plus vp-code"><code><span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> get_repeat_file</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">path</span><span style="color:#D4D4D4;">: </span><span style="color:#4EC9B0;">str</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">is_recursive</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">) -&gt; </span><span style="color:#4EC9B0;">dict</span><span style="color:#D4D4D4;">:</span></span>
<span class="line"><span style="color:#CE9178;">    &quot;&quot;&quot;</span></span>
<span class="line"><span style="color:#CE9178;">    获取重复文件</span></span>
<span class="line"><span style="color:#CE9178;">    :param path: 路径</span></span>
<span class="line"><span style="color:#CE9178;">    :param is_recursive:  是否递归</span></span>
<span class="line"><span style="color:#CE9178;">    :return: dict {&quot;md5&quot; : [file1,file2]}</span></span>
<span class="line"><span style="color:#CE9178;">    &quot;&quot;&quot;</span></span>
<span class="line"><span style="color:#D4D4D4;">    file_dict = </span><span style="color:#4EC9B0;">dict</span><span style="color:#D4D4D4;">()</span></span>
<span class="line"><span style="color:#C586C0;">    for</span><span style="color:#D4D4D4;"> f </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> get_file_list(path, is_recursive):</span></span>
<span class="line"><span style="color:#D4D4D4;">        md5 = get_md5(f)</span></span>
<span class="line"><span style="color:#C586C0;">        if</span><span style="color:#D4D4D4;"> md5 </span><span style="color:#569CD6;">not</span><span style="color:#569CD6;"> in</span><span style="color:#D4D4D4;"> file_dict:</span></span>
<span class="line"><span style="color:#D4D4D4;">            file_dict[md5] = [f]</span></span>
<span class="line"><span style="color:#C586C0;">        else</span><span style="color:#D4D4D4;">:</span></span>
<span class="line"><span style="color:#D4D4D4;">            file_dict[md5].append(f)</span></span>
<span class="line"><span style="color:#D4D4D4;">    repeat_files = </span><span style="color:#4EC9B0;">dict</span><span style="color:#D4D4D4;">()</span></span>
<span class="line"><span style="color:#C586C0;">    for</span><span style="color:#D4D4D4;"> k, v </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> file_dict.items():</span></span>
<span class="line"><span style="color:#C586C0;">        if</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(v) &gt; </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:</span></span>
<span class="line"><span style="color:#D4D4D4;">            repeat_files.update({k: v})</span></span>
<span class="line"><span style="color:#DCDCAA;">            print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">k</span><span style="color:#569CD6;">}</span><span style="color:#569CD6;"> {</span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(v)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
<span class="line"><span style="color:#6A9955;">    # return list(filter(lambda x: len(x) &gt; 1, file_dict.values()))</span></span>
<span class="line"><span style="color:#C586C0;">    return</span><span style="color:#D4D4D4;"> repeat_files</span></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><blockquote><p><code>get_file_list</code> 获取文件 , 参加下篇随笔来介绍。<br><code>get_md5</code> 获取文件MD5 , 参加下篇随笔来介绍。</p></blockquote>`,11),p=[o];function t(c,r){return e(),n("div",null,p)}const D=s(l,[["render",t],["__file","3be333.html.vue"]]),y=JSON.parse(`{"path":"/p2024/01/03/3be333.html","title":"获取重复的文件","lang":"zh-CN","frontmatter":{"permalink":"/p2024/01/03/3be333.html","lang":"zh-CN","title":"获取重复的文件","description":"获取重复的文件","isOriginal":true,"date":"2024-06-03T00:00:00.000Z","category":["Python3"],"tag":["Python3"],"head":[["meta",{"name":"keywords","content":"获取重复的文件,Python3"}],["meta",{"property":"og:url","content":"https://nstudy.org/p2024/01/03/3be333.html"}],["meta",{"property":"og:site_name","content":"Jack's 新学习基地"}],["meta",{"property":"og:title","content":"获取重复的文件"}],["meta",{"property":"og:description","content":"获取重复的文件"}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:locale","content":"zh-CN"}],["meta",{"property":"og:updated_time","content":"2024-10-26T07:22:00.000Z"}],["meta",{"property":"article:author","content":"Jack"}],["meta",{"property":"article:tag","content":"Python3"}],["meta",{"property":"article:published_time","content":"2024-06-03T00:00:00.000Z"}],["meta",{"property":"article:modified_time","content":"2024-10-26T07:22:00.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"获取重复的文件\\",\\"image\\":[\\"\\"],\\"datePublished\\":\\"2024-06-03T00:00:00.000Z\\",\\"dateModified\\":\\"2024-10-26T07:22:00.000Z\\",\\"author\\":[{\\"@type\\":\\"Person\\",\\"name\\":\\"Jack\\",\\"url\\":\\"https://nstudy.org\\"}]}"]],"sticky":true},"headers":[{"level":2,"title":"获取重复的文件","slug":"获取重复的文件","link":"#获取重复的文件","children":[]},{"level":2,"title":"背景和环境说明","slug":"背景和环境说明","link":"#背景和环境说明","children":[]},{"level":2,"title":"依赖包下载","slug":"依赖包下载","link":"#依赖包下载","children":[]},{"level":2,"title":"使用","slug":"使用","link":"#使用","children":[]},{"level":2,"title":"核心源码","slug":"核心源码","link":"#核心源码","children":[]}],"git":{"createdTime":1729927320000,"updatedTime":1729927320000,"contributors":[{"name":"lizhq","email":"lizhq08@163.com","commits":1}]},"readingTime":{"minutes":0.87,"words":260},"filePathRelative":"01.机器学习/03.工具/20240603_获取重复的文件.md","localizedDate":"2024年6月3日","excerpt":"<h2>获取重复的文件</h2>\\n<h2>背景和环境说明</h2>\\n<blockquote>\\n<p>本程序使用 <code>python 3.10+</code><br>\\n机器学习，样本处理过程中会有重复的数据。这里的样本是<code>图片</code>，本程序根据文件<code>MD5</code> 获取重复文件。</p>\\n</blockquote>\\n<h2>依赖包下载</h2>\\n<blockquote>\\n<p><code>pip install -U NStudyPy</code></p>\\n</blockquote>\\n<h2>使用</h2>\\n<div class=\\"language-python line-numbers-mode\\" data-highlighter=\\"shiki\\" data-ext=\\"python\\" data-title=\\"python\\" style=\\"background-color:#1E1E1E;color:#D4D4D4\\"><pre class=\\"shiki dark-plus vp-code\\"><code><span class=\\"line\\"><span style=\\"color:#C586C0\\">from</span><span style=\\"color:#D4D4D4\\"> NStudyPy </span><span style=\\"color:#C586C0\\">import</span><span style=\\"color:#D4D4D4\\"> PyFile</span></span>\\n<span class=\\"line\\"></span>\\n<span class=\\"line\\"><span style=\\"color:#C586C0\\">if</span><span style=\\"color:#9CDCFE\\"> __name__</span><span style=\\"color:#D4D4D4\\"> == </span><span style=\\"color:#CE9178\\">'__main__'</span><span style=\\"color:#D4D4D4\\">:</span></span>\\n<span class=\\"line\\"><span style=\\"color:#D4D4D4\\">    PyFile.get_repeat_file(</span><span style=\\"color:#569CD6\\">r</span><span style=\\"color:#D16969\\">'F:</span><span style=\\"color:#D7BA7D\\">\\\\t</span><span style=\\"color:#D16969\\">emp</span><span style=\\"color:#D7BA7D\\">\\\\c</span><span style=\\"color:#D16969\\">ards'</span><span style=\\"color:#D4D4D4\\">, </span><span style=\\"color:#9CDCFE\\">is_recursive</span><span style=\\"color:#D4D4D4\\">=</span><span style=\\"color:#569CD6\\">True</span><span style=\\"color:#D4D4D4\\">)</span></span></code></pre>\\n<div class=\\"line-numbers\\" aria-hidden=\\"true\\" style=\\"counter-reset:line-number 0\\"><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div></div></div>"}`);export{D as comp,y as data};
