Add option to exclude submission IDs (#220)

* Add option to exclude submission IDs
* Update README
* Update logging message
2021-03-27 17:58:43 +10:00
parent 56347da07e
commit 21e8f0f8b9
6 changed files with 79 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -102,6 +102,13 @@ The following options are common between both the `archive` and `download` comma

 The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.

+- `--exclude-id`
+  - This will skip the download of any submission with the ID provided
+  - Can be specified multiple times
+- `--exclude-id-file`
+  - This will skip the download of any submission with any of the IDs in the files provided
+  - Can be specified multiple times
+  - Format is one ID per line
 - `--make-hard-links`
  - This flag will create hard links to an existing file when a duplicate is downloaded
  - This will make the file appear in multiple directories while only taking the space of a single instance
--- a/bulkredditdownloader/main.py
+++ b/bulkredditdownloader/main.py
@@ -43,11 +43,13 @@ def cli():


@cli.command('download')
+@click.option('--exclude-id', default=None, multiple=True)
+@click.option('--exclude-id-file', default=None, multiple=True)
+@click.option('--file-scheme', default=None, type=str)
+@click.option('--folder-scheme', default=None, type=str)
@click.option('--make-hard-links', is_flag=True, default=None)
@click.option('--no-dupes', is_flag=True, default=None)
@click.option('--search-existing', is_flag=True, default=None)
-@click.option('--file-scheme', default=None, type=str)
-@click.option('--folder-scheme', default=None, type=str)
@click.option('--skip', default=None, multiple=True)
@click.option('--skip-domain', default=None, multiple=True)
@_add_common_options
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -13,6 +13,8 @@ class Configuration(Namespace):
        self.authenticate = False
        self.config = None
        self.directory: str = '.'
+        self.exclude_id = []
+        self.exclude_id_file = []
        self.limit: Optional[int] = None
        self.link: list[str] = []
        self.multireddit: list[str] = []
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -82,6 +82,8 @@ class RedditDownloader:
        self._create_reddit_instance()
        self._resolve_user_name()

+        self.excluded_submission_ids = self._read_excluded_ids()
+
        if self.args.search_existing:
            self.master_hash_list = self.scan_existing_files(self.download_directory)
        else:
@@ -323,8 +325,12 @@ class RedditDownloader:
    def download(self):
        for generator in self.reddit_lists:
            for submission in generator:
-                logger.debug(f'Attempting to download submission {submission.id}')
-                self._download_submission(submission)
+                if submission.id in self.excluded_submission_ids:
+                    logger.debug(f'Submission {submission.id} in exclusion list, skipping')
+                    continue
+                else:
+                    logger.debug(f'Attempting to download submission {submission.id}')
+                    self._download_submission(submission)

    def _download_submission(self, submission: praw.models.Submission):
        if not isinstance(submission, praw.models.Submission):
@@ -354,13 +360,15 @@ class RedditDownloader:
                    res.download()
                except errors.BulkDownloaderException:
                    logger.error(
-                        f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
+                        f'Failed to download resource {res.url} with downloader {downloader_class.__name__}')
                    return
                resource_hash = res.hash.hexdigest()
                destination.parent.mkdir(parents=True, exist_ok=True)
                if resource_hash in self.master_hash_list:
                    if self.args.no_dupes:
-                        logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
+                        logger.warning(
+                            f'Resource from "{res.url}" and hash "{resource_hash}" from submission {submission.id}'
+                            ' downloaded elsewhere')
                        return
                    elif self.args.make_hard_links:
                        self.master_hash_list[resource_hash].link_to(destination)
@@ -387,3 +395,16 @@ class RedditDownloader:

        hash_list = {res[1]: res[0] for res in results}
        return hash_list
+
+    def _read_excluded_ids(self) -> set[str]:
+        """Collect the submission IDs that must not be downloaded.
+
+        Combines the IDs given in ``self.args.exclude_id`` with the IDs listed, one per line, in
+        every file named in ``self.args.exclude_id_file``. A file that does not exist is reported
+        through an error log message and skipped; the remaining files are still read.
+
+        Returns the set of excluded submission IDs, stripped of surrounding whitespace.
+        """
+        out = set(self.args.exclude_id)
+        for id_file in self.args.exclude_id_file:
+            # Expand '~' before resolving: resolving first makes the path absolute by treating '~'
+            # as a literal directory under the working directory, after which expanduser() no
+            # longer sees a leading '~' to expand.
+            id_file = Path(id_file).expanduser().resolve()
+            if not id_file.exists():
+                logger.error(f'ID exclusion file at {id_file} does not exist')
+                continue
+            with open(id_file, 'r') as file:
+                # Ignore blank lines (e.g. trailing newlines) so no empty ID is recorded
+                stripped_lines = (line.strip() for line in file)
+                out.update(submission_id for submission_id in stripped_lines if submission_id)
+        return out
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -394,3 +394,29 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta

    assert test_file_1_stats.st_nlink == 2
    assert test_file_1_stats.st_ino == test_file_2_inode
+
+
+@pytest.mark.parametrize(('test_ids', 'test_excluded', 'expected_len'), (
+    (('aaaaaa',), (), 1),
+    (('aaaaaa',), ('aaaaaa',), 0),
+    ((), ('aaaaaa',), 0),
+    (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
+))
+def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
+    """Only submissions whose ID is absent from the exclusion list reach _download_submission."""
+    # Stand-in submissions carrying nothing but the ``id`` attribute that download() compares
+    fake_submissions = [MagicMock(id=submission_id) for submission_id in test_ids]
+    downloader_mock.excluded_submission_ids = test_excluded
+    downloader_mock.reddit_lists = [fake_submissions]
+
+    RedditDownloader.download(downloader_mock)
+
+    assert downloader_mock._download_submission.call_count == expected_len
+
+
+def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
+    """IDs written one per line to an exclusion file are returned as a set."""
+    id_file = tmp_path / 'test.txt'
+    id_file.write_text('aaaaaa\nbbbbbb')
+    downloader_mock.args.exclude_id_file = [id_file]
+
+    assert RedditDownloader._read_excluded_ids(downloader_mock) == {'aaaaaa', 'bbbbbb'}
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -239,3 +239,18 @@ def test_cli_download_use_default_config(tmp_path: Path):
    test_args = ['download', '-vv', str(tmp_path)]
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+# Skip when the config file is missing. The previous condition compared a Path object with
+# ``is False``, which is never true, so the test was never skipped.
+@pytest.mark.skipif(not Path('test_config.cfg').exists(), reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
+))
+def test_cli_download_links(test_args: list[str], tmp_path: Path):
+    """A linked submission whose ID is also passed to --exclude-id is skipped, not downloaded."""
+    runner = CliRunner()
+    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'in exclusion list' in result.output
+    assert 'Downloaded submission ' not in result.output