Calculate existing hashes in parallel

This commit is contained in:
Serene-Arc
2021-03-26 10:42:51 +10:00
committed by Ali Parlakci
parent 79105f9f84
commit 771cc711e4

View File

@@ -11,6 +11,7 @@ import shutil
 import socket
 from datetime import datetime
 from enum import Enum, auto
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Iterator
@@ -31,6 +32,12 @@ from bulkredditdownloader.site_downloaders.download_factory import DownloadFacto
 logger = logging.getLogger(__name__)
def _calc_hash(existing_file: Path):
with open(existing_file, 'rb') as file:
file_hash = hashlib.md5(file.read()).hexdigest()
return existing_file, file_hash
 class RedditTypes:
     class SortType(Enum):
         HOT = auto()
@@ -373,9 +380,10 @@ class RedditDownloader:
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        for existing_file in files:
-            with open(existing_file, 'rb') as file:
-                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
-            logger.log(9, f'Hash calculated for file at {existing_file}')
+        hash_list = {}
+        pool = Pool(15)
+        results = pool.map(_calc_hash, files)
+        pool.close()
+        hash_list = {res[1]: res[0] for res in results}
         return hash_list