Add option to exclude submission IDs (#220)
* Add option to exclude submission IDs * Update README * Update logging message
This commit is contained in:
@@ -102,6 +102,13 @@ The following options are common between both the `archive` and `download` comma
|
||||
|
||||
The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
|
||||
|
||||
- `--exclude-id`
|
||||
- This will skip the download of any submission with the ID provided
|
||||
- Can be specified multiple times
|
||||
- `--exclude-id-file`
|
||||
- This will skip the download of any submission with any of the IDs in the files provided
|
||||
- Can be specified multiple times
|
||||
- Format is one ID per line
|
||||
- `--make-hard-links`
|
||||
- This flag will create hard links to an existing file when a duplicate is downloaded
|
||||
- This will make the file appear in multiple directories while only taking the space of a single instance
|
||||
|
||||
@@ -43,11 +43,13 @@ def cli():
|
||||
|
||||
|
||||
@cli.command('download')
|
||||
@click.option('--exclude-id', default=None, multiple=True)
|
||||
@click.option('--exclude-id-file', default=None, multiple=True)
|
||||
@click.option('--file-scheme', default=None, type=str)
|
||||
@click.option('--folder-scheme', default=None, type=str)
|
||||
@click.option('--make-hard-links', is_flag=True, default=None)
|
||||
@click.option('--no-dupes', is_flag=True, default=None)
|
||||
@click.option('--search-existing', is_flag=True, default=None)
|
||||
@click.option('--file-scheme', default=None, type=str)
|
||||
@click.option('--folder-scheme', default=None, type=str)
|
||||
@click.option('--skip', default=None, multiple=True)
|
||||
@click.option('--skip-domain', default=None, multiple=True)
|
||||
@_add_common_options
|
||||
|
||||
@@ -13,6 +13,8 @@ class Configuration(Namespace):
|
||||
self.authenticate = False
|
||||
self.config = None
|
||||
self.directory: str = '.'
|
||||
self.exclude_id = []
|
||||
self.exclude_id_file = []
|
||||
self.limit: Optional[int] = None
|
||||
self.link: list[str] = []
|
||||
self.multireddit: list[str] = []
|
||||
|
||||
@@ -82,6 +82,8 @@ class RedditDownloader:
|
||||
self._create_reddit_instance()
|
||||
self._resolve_user_name()
|
||||
|
||||
self.excluded_submission_ids = self._read_excluded_ids()
|
||||
|
||||
if self.args.search_existing:
|
||||
self.master_hash_list = self.scan_existing_files(self.download_directory)
|
||||
else:
|
||||
@@ -323,8 +325,12 @@ class RedditDownloader:
|
||||
def download(self):
    """Download every collected submission that is not on the exclusion list.

    Walks each generator in ``self.reddit_lists``. A submission whose ID is in
    ``self.excluded_submission_ids`` is logged and skipped; every other
    submission is handed to ``self._download_submission`` exactly once.
    """
    for generator in self.reddit_lists:
        for submission in generator:
            # Check the exclusion list first so an excluded submission is never
            # passed to the downloader.
            if submission.id in self.excluded_submission_ids:
                logger.debug(f'Submission {submission.id} in exclusion list, skipping')
                continue
            logger.debug(f'Attempting to download submission {submission.id}')
            self._download_submission(submission)
|
||||
|
||||
def _download_submission(self, submission: praw.models.Submission):
|
||||
if not isinstance(submission, praw.models.Submission):
|
||||
@@ -354,13 +360,15 @@ class RedditDownloader:
|
||||
res.download()
|
||||
except errors.BulkDownloaderException:
|
||||
logger.error(
|
||||
f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
|
||||
f'Failed to download resource {res.url} with downloader {downloader_class.__name__}')
|
||||
return
|
||||
resource_hash = res.hash.hexdigest()
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
if resource_hash in self.master_hash_list:
|
||||
if self.args.no_dupes:
|
||||
logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
|
||||
logger.warning(
|
||||
f'Resource from "{res.url}" and hash "{resource_hash}" from submission {submission.id}'
|
||||
' downloaded elsewhere')
|
||||
return
|
||||
elif self.args.make_hard_links:
|
||||
self.master_hash_list[resource_hash].link_to(destination)
|
||||
@@ -387,3 +395,16 @@ class RedditDownloader:
|
||||
|
||||
hash_list = {res[1]: res[0] for res in results}
|
||||
return hash_list
|
||||
|
||||
def _read_excluded_ids(self) -> set[str]:
    """Collect the submission IDs that should be skipped.

    Combines the IDs in ``self.args.exclude_id`` with the IDs listed, one per
    line, in each file named in ``self.args.exclude_id_file``. A file that does
    not exist is logged as an error and ignored. Blank lines are skipped.

    Returns:
        The set of excluded submission IDs.
    """
    out = set(self.args.exclude_id)
    for id_file in self.args.exclude_id_file:
        # Expand '~' before resolving: resolve() makes the path absolute, after
        # which expanduser() no longer sees a leading '~' to expand.
        id_file = Path(id_file).expanduser().resolve()
        if not id_file.exists():
            logger.error(f'ID exclusion file at {id_file} does not exist')
            continue
        with open(id_file, 'r') as file:
            stripped_lines = (line.strip() for line in file)
            # Drop blank lines so an empty string never ends up in the set.
            out.update(line for line in stripped_lines if line)
    return out
|
||||
|
||||
@@ -394,3 +394,29 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta
|
||||
|
||||
assert test_file_1_stats.st_nlink == 2
|
||||
assert test_file_1_stats.st_ino == test_file_2_inode
|
||||
|
||||
|
||||
@pytest.mark.parametrize(('test_ids', 'test_excluded', 'expected_len'), (
    (('aaaaaa',), (), 1),
    (('aaaaaa',), ('aaaaaa',), 0),
    ((), ('aaaaaa',), 0),
    (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
))
def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
    """Only submissions whose ID is not excluded are passed to the downloader."""
    downloader_mock.excluded_submission_ids = test_excluded
    # One stand-in submission per ID; only the ``id`` attribute is set explicitly.
    fake_submissions = [MagicMock(id=submission_id) for submission_id in test_ids]
    downloader_mock.reddit_lists = [fake_submissions]

    RedditDownloader.download(downloader_mock)

    assert downloader_mock._download_submission.call_count == expected_len
|
||||
|
||||
|
||||
def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
    """IDs written one per line to an exclusion file are read back as a set."""
    id_list_path = tmp_path / 'test.txt'
    id_list_path.write_text('aaaaaa\nbbbbbb')
    downloader_mock.args.exclude_id_file = [id_list_path]

    excluded = RedditDownloader._read_excluded_ids(downloader_mock)

    assert excluded == {'aaaaaa', 'bbbbbb'}
|
||||
|
||||
@@ -239,3 +239,18 @@ def test_cli_download_use_default_config(tmp_path: Path):
|
||||
test_args = ['download', '-vv', str(tmp_path)]
|
||||
result = runner.invoke(cli, test_args)
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
# Skip when the config file is absent. The previous condition compared a Path
# object to False by identity, which is never true, so the test was never skipped.
@pytest.mark.skipif(
    not Path('test_config.cfg').exists(),
    reason='A test config file is required for integration tests',
)
@pytest.mark.parametrize('test_args', (
    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
))
def test_cli_download_links(test_args: list[str], tmp_path: Path):
    """A submission requested by link but also excluded by ID is skipped.

    Runs the ``download`` command with the parametrized arguments and checks
    that it exits cleanly, reports the exclusion, and reports no download.
    """
    runner = CliRunner()
    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert 'in exclusion list' in result.output
    assert 'Downloaded submission ' not in result.output
|
||||
|
||||
Reference in New Issue
Block a user