Calculate existing hashes in parallel
@@ -11,6 +11,7 @@ import shutil
 import socket
 from datetime import datetime
 from enum import Enum, auto
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Iterator
 
@@ -31,6 +32,12 @@ from bulkredditdownloader.site_downloaders.download_factory import DownloadFacto
 logger = logging.getLogger(__name__)
 
 
+def _calc_hash(existing_file: Path):
+    with open(existing_file, 'rb') as file:
+        file_hash = hashlib.md5(file.read()).hexdigest()
+    return existing_file, file_hash
+
+
 class RedditTypes:
     class SortType(Enum):
         HOT = auto()
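A note on this hunk: pool.map ships work to its callable by pickling a reference to it, and pickling a function records its module-qualified name, so the hash helper must live at module scope; the old inline loop body could not be handed to worker processes directly. A minimal, self-contained sketch of the pattern (the sample paths are hypothetical):

    import hashlib
    from multiprocessing import Pool
    from pathlib import Path

    # Defined at module scope so worker processes can locate it by name.
    def _calc_hash(existing_file: Path):
        with open(existing_file, 'rb') as file:
            file_hash = hashlib.md5(file.read()).hexdigest()
        return existing_file, file_hash

    if __name__ == '__main__':  # required under the spawn start method
        files = [Path('downloads/a.jpg'), Path('downloads/b.jpg')]  # hypothetical paths
        with Pool(4) as pool:
            print(pool.map(_calc_hash, files))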
@@ -373,9 +380,10 @@ class RedditDownloader:
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = {}
-        for existing_file in files:
-            with open(existing_file, 'rb') as file:
-                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
-                logger.log(9, f'Hash calculated for file at {existing_file}')
+
+        pool = Pool(15)
+        results = pool.map(_calc_hash, files)
+        pool.close()
+
+        hash_list = {res[1]: res[0] for res in results}
         return hash_list
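Two observations on the replaced loop. pool.map blocks until every file has been hashed, so results is complete before pool.close() runs; close() only stops the pool accepting further work. And the comprehension keys on res[1] (the digest) with res[0] (the path) as the value, preserving the previous {hash: path} shape, so duplicate file contents still collapse to one entry. For reference, a standalone sketch of the whole scan, assuming a directory to walk (the function name and target path here are illustrative; Pool(15) mirrors the hard-coded worker count in the commit):

    import hashlib
    import os
    from multiprocessing import Pool
    from pathlib import Path

    def _calc_hash(existing_file: Path):
        with open(existing_file, 'rb') as file:
            file_hash = hashlib.md5(file.read()).hexdigest()
        return existing_file, file_hash

    def scan_existing_files(directory: Path) -> dict:
        # Gather every file under the directory, then hash in parallel.
        files = []
        for (dirpath, dirnames, filenames) in os.walk(directory):
            files.extend([Path(dirpath, file) for file in filenames])
        # map() has returned before the context manager tears the pool down.
        with Pool(15) as pool:
            results = pool.map(_calc_hash, files)
        return {file_hash: path for path, file_hash in results}

    if __name__ == '__main__':
        print(scan_existing_files(Path('downloads')))  # hypothetical directory

If memory or latency matters, pool.imap_unordered(_calc_hash, files, chunksize=64) yields results as they finish; ordering is irrelevant here since everything lands in a dict.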
|