feat(Hashing): add persistent hash and URL checking to detect repeat downloads and avoid duplicates

2025-10-07 17:39:57 +13:00
parent 8c293a4684
commit 15f14088be
8 changed files with 584 additions and 10 deletions

View File

@@ -53,6 +53,7 @@ _downloader_options = [
click.option("--max-wait-time", type=int, default=None),
click.option("--no-dupes", is_flag=True, default=None),
click.option("--search-existing", is_flag=True, default=None),
click.option("--simple-check", is_flag=True, default=None, help="Enable fast URL-based duplicate checking (works with --no-dupes)"),
click.option("--skip", default=None, multiple=True),
click.option("--skip-domain", default=None, multiple=True),
click.option("--skip-subreddit", default=None, multiple=True),

View File

@@ -35,6 +35,7 @@ class Configuration(Namespace):
self.multireddit: list[str] = []
self.no_dupes: bool = False
self.saved: bool = False
self.simple_check: bool = False
self.search: Optional[str] = None
self.search_existing: bool = False
self.skip: list[str] = []
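The new attribute defaults to False and is flipped by the --simple-check CLI flag added above. A minimal sketch of the setting in use, assuming the bdfr.configuration module path and the no-argument constructor shown in this hunk:

from bdfr.configuration import Configuration

config = Configuration()
config.no_dupes = True      # persist and consult the hash list between runs
config.simple_check = True  # what passing --simple-check would set: check URLs before hashing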

View File

@@ -5,3 +5,4 @@ scopes = identity, history, read, save, mysubreddits
backup_log_count = 3
max_wait_time = 120
time_format = ISO

View File

@@ -2,8 +2,10 @@
# -*- coding: utf-8 -*-
import hashlib
import json
import logging.handlers
import os
import tempfile
import time
from collections.abc import Iterable
from datetime import datetime
@@ -39,22 +41,60 @@ def _calc_hash(existing_file: Path):
class RedditDownloader(RedditConnector):
def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()):
super(RedditDownloader, self).__init__(args, logging_handlers)
if self.args.search_existing:
self.master_hash_list = self.scan_existing_files(self.download_directory)
self.master_hash_list = {}
self.url_list = {} # New: Store URL to hash mapping for simple-check
# Load existing hashes if no_dupes is enabled or search_existing is requested
if self.args.no_dupes or self.args.search_existing:
# First try to load from persistent hash file
hash_data = self._load_hash_list()
# Handle both old and new hash file formats
if isinstance(hash_data, dict) and 'files' in hash_data:
# New format with enhanced structure
self.master_hash_list = {k: v['path'] for k, v in hash_data['files'].items()}
self.url_list = hash_data.get('urls', {})
logger.info(f"Loaded {len(self.master_hash_list)} hashes and {len(self.url_list)} URLs from enhanced hash file")
else:
# Old format - just hashes
self.master_hash_list = hash_data
logger.info(f"Loaded {len(self.master_hash_list)} hashes from legacy hash file")
# If search_existing is also enabled, scan for any new files not in hash list
if self.args.search_existing:
existing_hashes = set(self.master_hash_list.keys())
all_files_hashes = self.scan_existing_files(self.download_directory)
# Add any new files found by scanning
for hash_value, file_path in all_files_hashes.items():
if hash_value not in existing_hashes:
self.master_hash_list[hash_value] = file_path
logger.info(f"Loaded {len(self.master_hash_list)} total hashes "
f"({len(existing_hashes)} from file, {len(all_files_hashes) - len(existing_hashes)} new)")
def download(self):
for generator in self.reddit_lists:
last_submission_id = None
try:
for submission in generator:
last_submission_id = submission.id
try:
self._download_submission(submission)
except prawcore.PrawcoreException as e:
logger.error(f"Submission {submission.id} failed to download due to a PRAW exception: {e}")
except prawcore.PrawcoreException as e:
logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}")
submission_id = last_submission_id or "unknown"
logger.error(f"The submission after {submission_id} failed to download due to a PRAW exception: {e}")
logger.debug("Waiting 60 seconds to continue")
sleep(60)
# Save hash list after completion if no_dupes is enabled
# Always save if no_dupes is enabled, even if hash list is empty
# This creates the hash file for future runs
if self.args.no_dupes:
self._save_hash_list()
def _download_submission(self, submission: praw.models.Submission):
if submission.id in self.excluded_submission_ids:
logger.debug(f"Object {submission.id} in exclusion list, skipping")
@@ -108,15 +148,36 @@ class RedditDownloader(RedditConnector):
except errors.SiteDownloaderError as e:
logger.error(f"Site {downloader_class.__name__} failed to download submission {submission.id}: {e}")
return
files_processed = 0
for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
if destination.exists():
logger.debug(f"File {destination} from submission {submission.id} already exists, continuing")
continue
# Check if we already have this file's hash
if destination in self.master_hash_list.values():
logger.debug(f"File {destination} from submission {submission.id} already exists, continuing")
continue
else:
# File exists but not in our hash list - calculate its hash
try:
existing_file_hash = _calc_hash(destination)[1]
self.master_hash_list[existing_file_hash] = destination
# Store URL mapping for simple-check functionality if URL is available
if hasattr(res, 'url') and self.args.simple_check:
self.url_list[res.url] = existing_file_hash
logger.debug(f"Added hash for existing file: {existing_file_hash}")
files_processed += 1
if self.args.no_dupes:
self._save_hash_list()
except Exception as e:
logger.warning(f"Failed to calculate hash for existing file {destination}: {e}")
continue
elif not self.download_filter.check_resource(res):
logger.debug(f"Download filter removed {submission.id} file with URL {submission.url}")
continue
try:
res.download({"max_wait_time": self.args.max_wait_time})
print(f"DEBUG: Successfully downloaded resource {res.url}")
except errors.BulkDownloaderException as e:
logger.error(
f"Failed to download resource {res.url} in submission {submission.id} "
@@ -125,6 +186,15 @@ class RedditDownloader(RedditConnector):
return
resource_hash = res.hash.hexdigest()
destination.parent.mkdir(parents=True, exist_ok=True)
# Simple-check: URL-based duplicate detection (fast path)
if self.args.simple_check and hasattr(res, 'url') and res.url in self.url_list:
stored_hash = self.url_list[res.url]
if stored_hash in self.master_hash_list:
logger.info(f"URL {res.url} from submission {submission.id} already downloaded (simple-check)")
return
# Full hash-based duplicate detection
if resource_hash in self.master_hash_list:
if self.args.no_dupes:
logger.info(f"Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere")
@@ -138,11 +208,16 @@ class RedditDownloader(RedditConnector):
f"Hard link made linking {destination} to {self.master_hash_list[resource_hash]}"
f" in submission {submission.id}"
)
files_processed += 1
# Save hash list after successful hard link creation if no_dupes is enabled
if self.args.no_dupes:
self._save_hash_list()
return
try:
with destination.open("wb") as file:
file.write(res.content)
logger.debug(f"Written file to {destination}")
files_processed += 1
except OSError as e:
logger.exception(e)
logger.error(f"Failed to write file in submission {submission.id} to {destination}: {e}")
@@ -150,8 +225,23 @@ class RedditDownloader(RedditConnector):
creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
os.utime(destination, (creation_time, creation_time))
self.master_hash_list[resource_hash] = destination
# Store URL mapping for simple-check functionality
if hasattr(res, 'url') and self.args.simple_check:
self.url_list[res.url] = resource_hash
logger.debug(f"Hash added to master list: {resource_hash}")
logger.info(f"Downloaded submission {submission.id} from {submission.subreddit.display_name}")
logger.debug(f"Master hash list now contains {len(self.master_hash_list)} entries")
# Save hash list after successful download if no_dupes is enabled
if self.args.no_dupes:
self._save_hash_list()
# Only log "Downloaded submission" if files were actually processed
if files_processed > 0:
logger.info(f"Downloaded submission {submission.id} from {submission.subreddit.display_name}")
else:
logger.info(f"Skipped submission {submission.id} from {submission.subreddit.display_name} (no new files)")
@staticmethod
def scan_existing_files(directory: Path) -> dict[str, Path]:
@@ -166,3 +256,152 @@ class RedditDownloader(RedditConnector):
hash_list = {res[1]: res[0] for res in results}
return hash_list
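_calc_hash returns a (path, hexdigest) tuple, which is why the comprehension above inverts it into a {hexdigest: path} mapping and why the existing-file branch earlier indexes [1]. A usage sketch with a placeholder path:

file_path, file_hash = _calc_hash(Path("downloads/example.jpg"))  # placeholder file path
# file_path echoes the input path; file_hash is the hexdigest used as the master_hash_list key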
def get_master_hash_list(self) -> dict[str, Path]:
"""Get the current master hash list for testing purposes."""
return self.master_hash_list
def _load_hash_list(self) -> dict[str, Path]:
"""Load existing hash list from .bdfr_hashes.json in download directory."""
logger.debug(f"Loading hash list from directory: {self.download_directory}")
hash_file_path = self.download_directory / '.bdfr_hashes.json'
if not hash_file_path.exists():
logger.debug(f"No existing hash file found at {hash_file_path}")
return {}
try:
with open(hash_file_path, 'r', encoding='utf-8') as f:
hash_data = json.load(f)
if not isinstance(hash_data, dict):
logger.warning(f"Hash file {hash_file_path} contains invalid data format")
return {}
# Handle new enhanced format
if 'files' in hash_data and isinstance(hash_data['files'], dict):
# New format with enhanced structure
files_data = hash_data['files']
loaded_hashes = {}
urls_data = hash_data.get('urls', {})
for hash_value, file_info in files_data.items():
if isinstance(file_info, dict) and 'path' in file_info:
# New format: {"hash": {"path": "relative/path", "url": "http://..."}}
relative_path = file_info['path']
absolute_path = self.download_directory / relative_path
if absolute_path.exists():
loaded_hashes[hash_value] = absolute_path
else:
logger.debug(f"File {absolute_path} from hash file no longer exists")
# Load URL mapping for simple-check
if 'url' in file_info and file_info['url']:
self.url_list[file_info['url']] = hash_value
elif isinstance(file_info, str):
# Legacy format within new structure: {"hash": "relative/path"}
absolute_path = self.download_directory / file_info
if absolute_path.exists():
loaded_hashes[hash_value] = absolute_path
else:
logger.debug(f"File {absolute_path} from hash file no longer exists")
logger.info(f"Loaded {len(loaded_hashes)} hashes and {len(urls_data)} URLs from enhanced hash file")
return loaded_hashes
else:
# Legacy format: {"hash": "relative/path"}
loaded_hashes = {}
for hash_value, relative_path in hash_data.items():
absolute_path = self.download_directory / relative_path
if absolute_path.exists():
loaded_hashes[hash_value] = absolute_path
else:
logger.debug(f"File {absolute_path} from hash file no longer exists")
logger.info(f"Loaded {len(loaded_hashes)} hashes from legacy hash file")
return loaded_hashes
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse hash file {hash_file_path}: {e}")
return {}
except (OSError, IOError) as e:
logger.warning(f"Failed to read hash file {hash_file_path}: {e}")
return {}
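The legacy on-disk format handled by this fallback branch is simply a flat hash-to-relative-path mapping, e.g. (made-up values):

legacy_hash_file = {"d41d8cd98f00b204e9800998ecf8427e": "pics/example_image.jpg"}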
def _save_hash_list(self) -> None:
"""Save current hash list to .bdfr_hashes.json in download directory using atomic write."""
hash_file_path = self.download_directory / '.bdfr_hashes.json'
# Build enhanced data structure for new format
if self.args.simple_check:
# New enhanced format with URLs and metadata
hash_data = {
'files': {},
'urls': self.url_list.copy(),
'metadata': {
'version': '2.0',
'created_with': 'simple_check' if self.args.simple_check else 'standard',
'url_count': len(self.url_list),
'hash_count': len(self.master_hash_list)
}
}
# Convert absolute paths to relative paths for portability
for hash_value, absolute_path in self.master_hash_list.items():
try:
relative_path = absolute_path.relative_to(self.download_directory)
hash_data['files'][hash_value] = {
'path': str(relative_path),
'url': next((url for url, h in self.url_list.items() if h == hash_value), None),
'check_method': 'hash'
}
except ValueError:
# File is not relative to download directory, skip it
logger.debug(f"Skipping file {absolute_path} as it's not relative to download directory")
continue
else:
# Legacy format for backward compatibility
hash_data = {}
for hash_value, absolute_path in self.master_hash_list.items():
try:
relative_path = absolute_path.relative_to(self.download_directory)
hash_data[hash_value] = str(relative_path)
except ValueError:
# File is not relative to download directory, skip it
logger.debug(f"Skipping file {absolute_path} as it's not relative to download directory")
continue
# Atomic write: write to temporary file first, then rename
try:
with tempfile.NamedTemporaryFile(
mode='w',
dir=self.download_directory,
suffix='.tmp',
delete=False,
encoding='utf-8'
) as temp_file:
json.dump(hash_data, temp_file, indent=2)
temp_file_path = temp_file.name
# Atomic rename
if os.name == 'nt': # Windows
# On Windows, we need to remove the target file first if it exists
if hash_file_path.exists():
hash_file_path.unlink()
os.rename(temp_file_path, hash_file_path)
else: # Unix-like systems
os.rename(temp_file_path, hash_file_path)
logger.debug(f"Saved {len(hash_data)} hashes to {hash_file_path}")
except (OSError, IOError) as e:
logger.error(f"Failed to save hash file {hash_file_path}: {e}")
except Exception as e:
logger.error(f"Unexpected error saving hash file {hash_file_path}: {e}")
# Clean up temp file if it still exists
try:
if 'temp_file_path' in locals():
os.unlink(temp_file_path)
except (OSError, IOError):
pass
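A note on the rename branch above: deleting the target before os.rename on Windows briefly leaves no hash file on disk. os.replace overwrites the destination in a single call on both Windows and POSIX and would collapse the platform branch; a sketch under that assumption, not part of this commit (os is already imported at the top of the module):

os.replace(temp_file_path, hash_file_path)  # overwrites an existing destination atomically where the OS allows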

View File

@@ -9,7 +9,9 @@ import socket
from pathlib import Path
import praw
import prawcore
import requests
from praw.util.token_manager import BaseTokenManager
from bdfr.exceptions import BulkDownloaderException, RedditAuthenticationError
@@ -87,13 +89,13 @@ class OAuth2Authenticator:
client.close()
class OAuth2TokenManager(praw.reddit.BaseTokenManager):
class OAuth2TokenManager(BaseTokenManager):
def __init__(self, config: configparser.ConfigParser, config_location: Path):
super(OAuth2TokenManager, self).__init__()
self.config = config
self.config_location = config_location
def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer):
def pre_refresh_callback(self, authorizer: prawcore.auth.BaseAuthorizer):
if authorizer.refresh_token is None:
if self.config.has_option("DEFAULT", "user_token"):
authorizer.refresh_token = self.config.get("DEFAULT", "user_token")
@@ -101,7 +103,7 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
else:
raise RedditAuthenticationError("No auth token loaded in configuration")
def post_refresh_callback(self, authorizer: praw.reddit.Authorizer):
def post_refresh_callback(self, authorizer: prawcore.auth.BaseAuthorizer):
self.config.set("DEFAULT", "user_token", authorizer.refresh_token)
with Path(self.config_location).open(mode="w") as file:
self.config.write(file, True)
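Constructing the token manager only needs the two arguments shown in __init__ above; a minimal sketch with a placeholder config path:

import configparser
from pathlib import Path

config = configparser.ConfigParser()
config_location = Path("default_config.cfg")  # placeholder path to the bdfr config file
config.read(config_location)
token_manager = OAuth2TokenManager(config, config_location)
# pre_refresh_callback pulls DEFAULT/user_token from this parser into the authorizer;
# post_refresh_callback writes the refreshed token back to config_location.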