diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index cc4e8bb..a262dae 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -22,13 +22,13 @@ logger = logging.getLogger(__name__)
 
 
 def _calc_hash(existing_file: Path):
-    CHUNK_SIZE = 1024 * 1024
+    chunk_size = 1024 * 1024
     md5_hash = hashlib.md5()
     with open(existing_file, 'rb') as file:
-        chunk = file.read(CHUNK_SIZE)
+        chunk = file.read(chunk_size)
         while chunk:
             md5_hash.update(chunk)
-            chunk = file.read(CHUNK_SIZE)
+            chunk = file.read(chunk_size)
     file_hash = md5_hash.hexdigest()
     return existing_file, file_hash
 
@@ -94,9 +94,15 @@ class RedditDownloader(RedditConnector):
                         f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}'
                         f' in submission {submission.id}')
                     return
-            with open(destination, 'wb') as file:
-                file.write(res.content)
-            logger.debug(f'Written file to {destination}')
+            try:
+                with open(destination, 'wb') as file:
+                    file.write(res.content)
+                logger.debug(f'Written file to {destination}')
+            except OSError as e:
+                logger.exception(e)
+                logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
+                # The file was not written: skip os.utime (it would raise) and do not register its hash.
+                return
             creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
             os.utime(destination, (creation_time, creation_time))
             self.master_hash_list[resource_hash] = destination
diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh
index 7108592..89f1896 100755
--- a/scripts/extract_failed_ids.sh
+++ b/scripts/extract_failed_ids.sh
@@ -18,4 +18,5 @@ fi
     grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ;
     grep 'Failed to download resource' "$file" | awk '{ print $15 }' ;
     grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ;
+    grep 'Failed to write file' "$file" | awk '{ print $14 }' ;
 } >>"$output"