From 4fc0d5dc1dd72c9444c00f73f8aaee30c6446adc Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:42:40 -0500 Subject: [PATCH 01/12] Add score filtering --- bdfr/downloader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 02f5c68..001a079 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -57,6 +57,12 @@ class RedditDownloader(RedditConnector): f'Submission {submission.id} in {submission.subreddit.display_name} skipped' f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return + elif submission.score < self.args.min_score or self.args.max_score < submission.score: + logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < {self.args.min_score}") + return + elif submission.upvote_ratio < self.args.min_score_ratio or self.args.max_score_ratio < submission.upvote_ratio: + logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})") + return elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') return From 89653c4bad5559dda22317def8cd320d64f80e20 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:48:52 -0500 Subject: [PATCH 02/12] Update README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 82d8812..a539331 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,15 @@ The following options apply only to the `download` command. 
This command downloa - This skips all submissions from the specified subreddit - Can be specified multiple times - Also accepts CSV subreddit names +- `--min-score` + - This skips all submissions which have fewer upvotes than the specified value +- `--max-score` + - This skips all submissions which have more upvotes than the specified value +- `--min-score-ratio` + - This skips all submissions which have an upvote ratio lower than the specified value +- `--max-score-ratio` + - This skips all submissions which have an upvote ratio higher than the specified value + ### Archiver Options From 95454078966e626ac3fb3c25e769b51012cd1d0e Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:52:12 -0500 Subject: [PATCH 03/12] Update __main__.py --- bdfr/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 3b2472a..1117a70 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -50,6 +50,10 @@ _downloader_options = [ click.option('--skip', default=None, multiple=True), click.option('--skip-domain', default=None, multiple=True), click.option('--skip-subreddit', default=None, multiple=True), + click.option('--min-score', type=int, default=None), + click.option('--max-score', type=int, default=None), + click.option('--min-score-ratio', type=float, default=None), + click.option('--max-score-ratio', type=float, default=None), ] _archiver_options = [ From 7eb2ab6d7d70360fc4f6082fb7a69d49c1446a8b Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:53:58 -0500 Subject: [PATCH 04/12] Update configuration.py --- bdfr/configuration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index ddc1401..46c4cf0 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -38,6 +38,10 @@ class Configuration(Namespace): self.skip: list[str] = [] self.skip_domain: list[str] = [] self.skip_subreddit: 
list[str] = [] + self.min_score = None + self.max_score = None + self.min_score_ratio = None + self.max_score_ratio = None self.sort: str = 'hot' self.submitted: bool = False self.subscribed: bool = False From 5d76fcd5aa0141104cd4860800989bdc0281b635 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 23:35:44 -0500 Subject: [PATCH 05/12] Update downloader.py --- bdfr/downloader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 001a079..adfadcb 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -57,10 +57,15 @@ class RedditDownloader(RedditConnector): f'Submission {submission.id} in {submission.subreddit.display_name} skipped' f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return - elif submission.score < self.args.min_score or self.args.max_score < submission.score: - logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < {self.args.min_score}") + elif self.args.min_score and submission.score < self.args.min_score: + logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]") return - elif submission.upvote_ratio < self.args.min_score_ratio or self.args.max_score_ratio < submission.upvote_ratio: + elif self.args.max_score and self.args.max_score < submission.score: + logger.debug(f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") + return + elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or ( + self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio + ): logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})") return elif not isinstance(submission, praw.models.Submission): From f22a8aec4d589e77c868d99e30992a6502ce112d Mon Sep 17 
00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 12:55:53 +1000 Subject: [PATCH 06/12] Fix line length --- bdfr/downloader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index adfadcb..83b5ebf 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -58,10 +58,12 @@ class RedditDownloader(RedditConnector): f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return elif self.args.min_score and submission.score < self.args.min_score: - logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]") + logger.debug( + f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]") return elif self.args.max_score and self.args.max_score < submission.score: - logger.debug(f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") + logger.debug( + f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") return elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or ( self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio From 2bbf1b644e4446502edce864ed300e9fbcb2b91b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 13:41:31 +1000 Subject: [PATCH 07/12] Change logging message --- bdfr/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 83b5ebf..3b5a7e1 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -63,7 +63,7 @@ class RedditDownloader(RedditConnector): return elif self.args.max_score and self.args.max_score < submission.score: logger.debug( - f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") + f"Submission {submission.id} filtered due to score {submission.score} > 
[{self.args.max_score}]") return elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or ( self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio From 9d631257243927d0a00073d92bd80493e0bfde9c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 13:47:43 +1000 Subject: [PATCH 08/12] Add tests for downloader --- tests/test_downloader.py | 104 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index e5f0a31..e2e9e82 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -200,3 +200,107 @@ def test_download_submission( RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) assert len(folder_contents) == expected_files_len + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'min_score'), ( + ('ljyy27', 1), +)) +def test_download_submission_min_score_above( + test_submission_id: str, + min_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.min_score = min_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' not in output.out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'min_score'), ( + ('ljyy27', 25), +)) +def test_download_submission_min_score_below( + 
test_submission_id: str, + min_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.min_score = min_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' in output.out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'max_score'), ( + ('ljyy27', 25), +)) +def test_download_submission_max_score_below( + test_submission_id: str, + max_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.max_score = max_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' not in output.out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'max_score'), ( + ('ljyy27', 1), +)) +def test_download_submission_max_score_above( + test_submission_id: str, + max_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: 
Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.max_score = max_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' in output.out From b47b90f2332d07ed1b06b74343dddcd04b1e9ab6 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 13:59:35 +1000 Subject: [PATCH 09/12] Add integration tests --- .../test_download_integration.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 93d9392..15173b6 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -351,3 +351,19 @@ def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'Downloaded submission' not in result.output assert 'being an ignored user' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_args', 'was_filtered'), ( + (['-l', 'ljyy27', '--min-score', '50'], True), + (['-l', 'ljyy27', '--min-score', '1'], False), + (['-l', 'ljyy27', '--max-score', '1'], True), + (['-l', 'ljyy27', '--max-score', '100'], False), +)) +def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert ('filtered due to score' 
in result.output) == was_filtered From 55c95495b238bf13e699df42a06f84eae28735fa Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 14:49:45 +1000 Subject: [PATCH 10/12] Fix test structure --- scripts/tests/test_extract_failed_ids.bats | 15 ++++++++++----- scripts/tests/test_extract_successful_ids.bats | 15 ++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/scripts/tests/test_extract_failed_ids.bats b/scripts/tests/test_extract_failed_ids.bats index a716cba..04eada6 100644 --- a/scripts/tests/test_extract_failed_ids.bats +++ b/scripts/tests/test_extract_failed_ids.bats @@ -13,31 +13,36 @@ teardown() { } @test "fail no downloader module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail resource error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail site downloader error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail failed file write" { - run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | 
awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail disabled module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index caa8dd1..ddbd2ef 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -8,31 +8,36 @@ teardown() { } @test "success downloaded submission" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success resource hash" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success download filter" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success already exists" { - run ../extract_successful_ids.sh 
./example_logfiles/succeed_already_exists.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success hard link" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } From 44e4c16b76d5fa0397eacf3d956a9909b22e5464 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 14:50:57 +1000 Subject: [PATCH 11/12] Update bash script --- scripts/extract_successful_ids.sh | 1 + scripts/tests/example_logfiles/succeed_score_filter.txt | 2 ++ scripts/tests/test_extract_successful_ids.bats | 7 +++++++ 3 files changed, 10 insertions(+) create mode 100644 scripts/tests/example_logfiles/succeed_score_filter.txt diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index e8f482e..f2128e5 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -13,4 +13,5 @@ fi grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ; grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; grep 'Hard link made' "$file" | awk '{ print $(NF) }' ; + grep 'filtered due to score' "$file" | awk '{ print $9 }' } diff --git a/scripts/tests/example_logfiles/succeed_score_filter.txt b/scripts/tests/example_logfiles/succeed_score_filter.txt new file mode 100644 index 0000000..8f31ef7 --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_score_filter.txt @@ -0,0 +1,2 @@ +[2022-07-23 14:04:14,095 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 15 < [50] +[2022-07-23 14:04:14,104 - 
bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1] \ No newline at end of file diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index ddbd2ef..6ff54bc 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -41,3 +41,10 @@ teardown() { assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } + +@test "success score filter" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_score_filter.txt + echo "$output" > successful.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "2" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} From 4b160c26118a7aff465a236bed8c8008a971dc2e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 15:06:49 +1000 Subject: [PATCH 12/12] Add missing flag --- tests/integration_tests/test_download_integration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 15173b6..a474172 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -355,6 +355,7 @@ def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize(('test_args', 'was_filtered'), ( (['-l', 'ljyy27', '--min-score', '50'], True), (['-l', 'ljyy27', '--min-score', '1'], False),