diff --git a/README.md b/README.md
index b6583cb..0c3cdf9 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,13 @@ The following options are common between both the `archive` and `download` comma
 
 The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
 
+- `--exclude-id`
+    - This will skip the download of any submission with the ID provided
+    - Can be specified multiple times
+- `--exclude-id-file`
+    - This will skip the download of any submission with any of the IDs in the files provided
+    - Can be specified multiple times
+    - Format is one ID per line; blank lines are ignored
 - `--make-hard-links`
     - This flag will create hard links to an existing file when a duplicate is downloaded
     - This will make the file appear in multiple directories while only taking the space of a single instance
diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index 78932b7..a3574e1 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -43,11 +43,13 @@ def cli():
 
 
 @cli.command('download')
+@click.option('--exclude-id', default=None, multiple=True)
+@click.option('--exclude-id-file', default=None, multiple=True)
+@click.option('--file-scheme', default=None, type=str)
+@click.option('--folder-scheme', default=None, type=str)
 @click.option('--make-hard-links', is_flag=True, default=None)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--search-existing', is_flag=True, default=None)
-@click.option('--file-scheme', default=None, type=str)
-@click.option('--folder-scheme', default=None, type=str)
 @click.option('--skip', default=None, multiple=True)
 @click.option('--skip-domain', default=None, multiple=True)
 @_add_common_options
diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index 1227590..5cb23b3 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -13,6 +13,8 @@ class Configuration(Namespace):
         self.authenticate = False
         self.config = None
         self.directory: str = '.'
+        self.exclude_id: list[str] = []
+        self.exclude_id_file: list[str] = []
         self.limit: Optional[int] = None
         self.link: list[str] = []
         self.multireddit: list[str] = []
diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 03c468c..58efeb5 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -82,6 +82,8 @@ class RedditDownloader:
         self._create_reddit_instance()
         self._resolve_user_name()
 
+        self.excluded_submission_ids = self._read_excluded_ids()
+
         if self.args.search_existing:
             self.master_hash_list = self.scan_existing_files(self.download_directory)
         else:
@@ -323,8 +325,11 @@ class RedditDownloader:
     def download(self):
         for generator in self.reddit_lists:
             for submission in generator:
+                if submission.id in self.excluded_submission_ids:
+                    logger.debug(f'Submission {submission.id} in exclusion list, skipping')
+                    continue
                 logger.debug(f'Attempting to download submission {submission.id}')
                 self._download_submission(submission)
 
     def _download_submission(self, submission: praw.models.Submission):
         if not isinstance(submission, praw.models.Submission):
@@ -354,13 +359,15 @@ class RedditDownloader:
                     res.download()
                 except errors.BulkDownloaderException:
                     logger.error(
-                        f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
+                        f'Failed to download resource {res.url} with downloader {downloader_class.__name__}')
                     return
                 resource_hash = res.hash.hexdigest()
                 destination.parent.mkdir(parents=True, exist_ok=True)
                 if resource_hash in self.master_hash_list:
                     if self.args.no_dupes:
-                        logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
+                        logger.warning(
+                            f'Resource from "{res.url}" and hash "{resource_hash}" from submission {submission.id}'
+                            ' downloaded elsewhere')
                         return
                     elif self.args.make_hard_links:
                         self.master_hash_list[resource_hash].link_to(destination)
@@ -387,3 +394,17 @@ class RedditDownloader:
 
         hash_list = {res[1]: res[0] for res in results}
         return hash_list
+
+    def _read_excluded_ids(self) -> set[str]:
+        """Collect the submission IDs to skip from --exclude-id and --exclude-id-file."""
+        out = set(self.args.exclude_id)
+        for id_file in self.args.exclude_id_file:
+            # Expand '~' before resolving; resolving first makes the path absolute and '~' is never expanded
+            id_file = Path(id_file).expanduser().resolve()
+            if not id_file.exists():
+                logger.error(f'ID exclusion file at {id_file} does not exist')
+                continue
+            with open(id_file, 'r') as file:
+                out.update(line.strip() for line in file)
+        out.discard('')  # blank lines in an exclusion file are not IDs
+        return out
diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index 3d1bba7..1d43521 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -394,3 +394,29 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta
 
     assert test_file_1_stats.st_nlink == 2
     assert test_file_1_stats.st_ino == test_file_2_inode
+
+
+@pytest.mark.parametrize(('test_ids', 'test_excluded', 'expected_len'), (
+    (('aaaaaa',), (), 1),
+    (('aaaaaa',), ('aaaaaa',), 0),
+    ((), ('aaaaaa',), 0),
+    (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
+))
+def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
+    downloader_mock.excluded_submission_ids = test_excluded
+    test_submissions = []
+    for test_id in test_ids:
+        m = MagicMock()
+        m.id = test_id
+        test_submissions.append(m)
+    downloader_mock.reddit_lists = [test_submissions]
+    RedditDownloader.download(downloader_mock)
+    assert downloader_mock._download_submission.call_count == expected_len
+
+
+def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
+    test_file = tmp_path / 'test.txt'
+    test_file.write_text('aaaaaa\n\nbbbbbb\n')
+    downloader_mock.args.exclude_id_file = [test_file]
+    results = RedditDownloader._read_excluded_ids(downloader_mock)
+    assert results == {'aaaaaa', 'bbbbbb'}
diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py
index a50d7f7..4daebad 100644
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -239,3 +239,18 @@ def test_cli_download_use_default_config(tmp_path: Path):
     test_args = ['download', '-vv', str(tmp_path)]
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(not Path('test_config.cfg').exists(), reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
+))
+def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'in exclusion list' in result.output
+    assert 'Downloaded submission ' not in result.output