Calculate existing hashes in parallel
@@ -11,6 +11,7 @@ import shutil
 import socket
 from datetime import datetime
 from enum import Enum, auto
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Iterator
 
@@ -31,6 +32,12 @@ from bulkredditdownloader.site_downloaders.download_factory import DownloadFacto
 logger = logging.getLogger(__name__)
 
 
+def _calc_hash(existing_file: Path):
+    with open(existing_file, 'rb') as file:
+        file_hash = hashlib.md5(file.read()).hexdigest()
+    return existing_file, file_hash
+
+
 class RedditTypes:
     class SortType(Enum):
         HOT = auto()
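A note on this hunk: pool.map ships work to its callable by pickling a reference to it, and pickling a function records its module-qualified name, so the hash helper must live at module scope; the old inline loop body could not be handed to worker processes directly. A minimal, self-contained sketch of the pattern (the sample paths are hypothetical):

    import hashlib
    from multiprocessing import Pool
    from pathlib import Path

    # Defined at module scope so worker processes can locate it by name.
    def _calc_hash(existing_file: Path):
        with open(existing_file, 'rb') as file:
            file_hash = hashlib.md5(file.read()).hexdigest()
        return existing_file, file_hash

    if __name__ == '__main__':  # required under the spawn start method
        files = [Path('downloads/a.jpg'), Path('downloads/b.jpg')]  # hypothetical paths
        with Pool(4) as pool:
            print(pool.map(_calc_hash, files))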
@@ -373,9 +380,10 @@ class RedditDownloader:
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = {}
-        for existing_file in files:
-            with open(existing_file, 'rb') as file:
-                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
-                logger.log(9, f'Hash calculated for file at {existing_file}')
+
+        pool = Pool(15)
+        results = pool.map(_calc_hash, files)
+        pool.close()
+
+        hash_list = {res[1]: res[0] for res in results}
         return hash_list
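Two observations on the replaced loop. pool.map blocks until every file has been hashed, so results is complete before pool.close() runs; close() only stops the pool accepting further work. And the comprehension keys on res[1] (the digest) with res[0] (the path) as the value, preserving the previous {hash: path} shape, so duplicate file contents still collapse to one entry. For reference, a standalone sketch of the whole scan, assuming a directory to walk (the function name and target path here are illustrative; Pool(15) mirrors the hard-coded worker count in the commit):

    import hashlib
    import os
    from multiprocessing import Pool
    from pathlib import Path

    def _calc_hash(existing_file: Path):
        with open(existing_file, 'rb') as file:
            file_hash = hashlib.md5(file.read()).hexdigest()
        return existing_file, file_hash

    def scan_existing_files(directory: Path) -> dict:
        # Gather every file under the directory, then hash in parallel.
        files = []
        for (dirpath, dirnames, filenames) in os.walk(directory):
            files.extend([Path(dirpath, file) for file in filenames])
        # map() has returned before the context manager tears the pool down.
        with Pool(15) as pool:
            results = pool.map(_calc_hash, files)
        return {file_hash: path for path, file_hash in results}

    if __name__ == '__main__':
        print(scan_existing_files(Path('downloads')))  # hypothetical directory

If memory or latency matters, pool.imap_unordered(_calc_hash, files, chunksize=64) yields results as they finish; ordering is irrelevant here since everything lands in a dict.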
|