Add option to exclude submission IDs (#220)

* Add option to exclude submission IDs

* Update README

* Update logging message
This commit is contained in:
Serene
2021-03-27 17:58:43 +10:00
committed by Ali Parlakci
parent 56347da07e
commit 21e8f0f8b9
6 changed files with 79 additions and 6 deletions

View File

@@ -102,6 +102,13 @@ The following options are common between both the `archive` and `download` comma
The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
- `--exclude-id`
- This will skip the download of any submission with the ID provided
- Can be specified multiple times
- `--exclude-id-file`
- This will skip the download of any submission with any of the IDs in the files provided
- Can be specified multiple times
- Format is one ID per line
- `--make-hard-links`
- This flag will create hard links to an existing file when a duplicate is downloaded
- This will make the file appear in multiple directories while only taking the space of a single instance

View File

@@ -43,11 +43,13 @@ def cli():
@cli.command('download')
@click.option('--exclude-id', default=None, multiple=True)
@click.option('--exclude-id-file', default=None, multiple=True)
@click.option('--file-scheme', default=None, type=str)
@click.option('--folder-scheme', default=None, type=str)
@click.option('--make-hard-links', is_flag=True, default=None)
@click.option('--no-dupes', is_flag=True, default=None)
@click.option('--search-existing', is_flag=True, default=None)
@click.option('--file-scheme', default=None, type=str)
@click.option('--folder-scheme', default=None, type=str)
@click.option('--skip', default=None, multiple=True)
@click.option('--skip-domain', default=None, multiple=True)
@_add_common_options

View File

@@ -13,6 +13,8 @@ class Configuration(Namespace):
self.authenticate = False
self.config = None
self.directory: str = '.'
self.exclude_id = []
self.exclude_id_file = []
self.limit: Optional[int] = None
self.link: list[str] = []
self.multireddit: list[str] = []

View File

@@ -82,6 +82,8 @@ class RedditDownloader:
self._create_reddit_instance()
self._resolve_user_name()
self.excluded_submission_ids = self._read_excluded_ids()
if self.args.search_existing:
self.master_hash_list = self.scan_existing_files(self.download_directory)
else:
@@ -323,8 +325,12 @@ class RedditDownloader:
def download(self):
    """Download every submission from the configured sources, honouring the exclusion list.

    Iterates over each generator in ``self.reddit_lists`` and passes every
    submission to ``self._download_submission`` unless the submission's ID is
    contained in ``self.excluded_submission_ids``, in which case it is skipped.
    """
    for generator in self.reddit_lists:
        for submission in generator:
            # The exclusion check has to happen before any download attempt;
            # otherwise an excluded submission is still fetched and a
            # non-excluded one is processed twice.
            if submission.id in self.excluded_submission_ids:
                logger.debug(f'Submission {submission.id} in exclusion list, skipping')
                continue
            logger.debug(f'Attempting to download submission {submission.id}')
            self._download_submission(submission)
def _download_submission(self, submission: praw.models.Submission):
if not isinstance(submission, praw.models.Submission):
@@ -354,13 +360,15 @@ class RedditDownloader:
res.download()
except errors.BulkDownloaderException:
logger.error(
f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
f'Failed to download resource {res.url} with downloader {downloader_class.__name__}')
return
resource_hash = res.hash.hexdigest()
destination.parent.mkdir(parents=True, exist_ok=True)
if resource_hash in self.master_hash_list:
if self.args.no_dupes:
logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
logger.warning(
f'Resource from "{res.url}" and hash "{resource_hash}" from submission {submission.id}'
' downloaded elsewhere')
return
elif self.args.make_hard_links:
self.master_hash_list[resource_hash].link_to(destination)
@@ -387,3 +395,16 @@ class RedditDownloader:
hash_list = {res[1]: res[0] for res in results}
return hash_list
def _read_excluded_ids(self) -> set[str]:
    """Collect the submission IDs that must be skipped.

    Combines the IDs given in ``self.args.exclude_id`` with the IDs read from
    every file listed in ``self.args.exclude_id_file`` (one ID per line).
    A file that does not exist is reported through the logger and ignored.

    Returns:
        The set of excluded submission IDs.
    """
    excluded = set(self.args.exclude_id)
    for id_file in self.args.exclude_id_file:
        # Expand "~" before resolving: calling resolve() first turns "~/ids.txt"
        # into "<cwd>/~/ids.txt", after which expanduser() has nothing to expand.
        id_file = Path(id_file).expanduser().resolve()
        if not id_file.exists():
            logger.error(f'ID exclusion file at {id_file} does not exist')
            continue
        with open(id_file, 'r') as file:
            # Drop blank lines so an empty string never ends up in the ID set.
            excluded.update(stripped for line in file if (stripped := line.strip()))
    return excluded

View File

@@ -394,3 +394,29 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta
assert test_file_1_stats.st_nlink == 2
assert test_file_1_stats.st_ino == test_file_2_inode
@pytest.mark.parametrize(
    ('test_ids', 'test_excluded', 'expected_len'),
    (
        (('aaaaaa',), (), 1),
        (('aaaaaa',), ('aaaaaa',), 0),
        ((), ('aaaaaa',), 0),
        (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
    ),
)
def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
    """Check that only submissions whose ID is not excluded reach ``_download_submission``."""
    # One stand-in submission per ID; only the ``id`` attribute is read by download().
    fake_submissions = [MagicMock(id=submission_id) for submission_id in test_ids]
    downloader_mock.reddit_lists = [fake_submissions]
    downloader_mock.excluded_submission_ids = test_excluded

    RedditDownloader.download(downloader_mock)

    assert downloader_mock._download_submission.call_count == expected_len
def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
    """Check that IDs written one per line to an exclusion file are returned as a set."""
    id_file = tmp_path / 'test.txt'
    id_file.write_text('aaaaaa\nbbbbbb')
    downloader_mock.args.exclude_id_file = [id_file]

    excluded = RedditDownloader._read_excluded_ids(downloader_mock)

    assert excluded == {'aaaaaa', 'bbbbbb'}

View File

@@ -239,3 +239,18 @@ def test_cli_download_use_default_config(tmp_path: Path):
test_args = ['download', '-vv', str(tmp_path)]
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
@pytest.mark.online
@pytest.mark.reddit
# A Path object is never the singleton ``False``, so ``Path(...) is False`` could
# never trigger the skip; test for the file's existence instead.
@pytest.mark.skipif(
    not Path('test_config.cfg').exists(),
    reason='A test config file is required for integration tests',
)
@pytest.mark.parametrize('test_args', (
    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
))
def test_cli_download_links(test_args: list[str], tmp_path: Path):
    """Run the ``download`` command with an excluded submission ID and check it is skipped.

    The submission requested with ``-l`` is also passed to ``--exclude-id``, so the
    output must contain the exclusion message and no download confirmation.
    """
    # NOTE(review): if another test in this module is already named
    # ``test_cli_download_links``, this definition would shadow it - confirm.
    runner = CliRunner()
    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert 'in exclusion list' in result.output
    assert 'Downloaded submission ' not in result.output