From d960bc0b7be1fc4732c48e1c1b890b4e25f1cd23 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 22 Apr 2021 10:38:32 +1000 Subject: [PATCH 01/56] Use ISO format for timestamps in names --- bdfr/file_name_formatter.py | 11 ++++++++--- tests/test_file_name_formatter.py | 16 ++++++++++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index c4bf4b5..e1d42d7 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 - +import datetime import logging import platform import re @@ -74,10 +74,15 @@ class FileNameFormatter: 'postid': submission.id, 'upvotes': submission.score, 'flair': submission.link_flair_text, - 'date': submission.created_utc + 'date': FileNameFormatter._convert_timestamp(submission.created_utc), } return submission_attributes + @staticmethod + def _convert_timestamp(timestamp: float) -> str: + input_time = datetime.datetime.fromtimestamp(timestamp) + return input_time.isoformat() + @staticmethod def _generate_name_dict_from_comment(comment: Comment) -> dict: comment_attributes = { @@ -87,7 +92,7 @@ class FileNameFormatter: 'postid': comment.id, 'upvotes': comment.score, 'flair': '', - 'date': comment.created_utc, + 'date': FileNameFormatter._convert_timestamp(comment.created_utc), } return comment_attributes diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 7a91d8c..bcb38d7 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # coding=utf-8 +from datetime import datetime from pathlib import Path from typing import Optional from unittest.mock import MagicMock @@ -21,7 +22,7 @@ def submission() -> MagicMock: test.id = '12345' test.score = 1000 test.link_flair_text = 'test_flair' - test.created_utc = 123456789 + test.created_utc = datetime(2021, 4, 21, 9, 30, 0).timestamp() test.__class__ = praw.models.Submission return test @@ -37,7 +38,7 @@ def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: ('{POSTID}', '12345'), ('{UPVOTES}', '1000'), ('{FLAIR}', 'test_flair'), - ('{DATE}', '123456789'), + ('{DATE}', '2021-04-21T09:30:00'), ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345'), ('{RANDOM}', '{RANDOM}'), )) @@ -318,3 +319,14 @@ def test_preserve_emojis(test_name_string: str, expected: str, submission: Magic def test_convert_unicode_escapes(test_string: str, expected: str): result = FileNameFormatter._convert_unicode_escapes(test_string) assert result == expected + + +@pytest.mark.parametrize(('test_datetime', 'expected'), ( + (datetime(2020, 1, 1, 8, 0, 0), '2020-01-01T08:00:00'), + (datetime(2020, 1, 1, 8, 0), '2020-01-01T08:00:00'), + (datetime(2021, 4, 21, 8, 30, 21), '2021-04-21T08:30:21'), +)) +def test_convert_timestamp(test_datetime: datetime, expected: str): + test_timestamp = test_datetime.timestamp() + result = FileNameFormatter._convert_timestamp(test_timestamp) + assert result == expected From 6767777944d34ff92548a4a873256ef3cd768982 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:17:06 +1000 Subject: [PATCH 02/56] Catch requests errors in site downloaders --- bdfr/site_downloaders/base_downloader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py index ac45dc3..10787b8 100644 --- a/bdfr/site_downloaders/base_downloader.py +++ 
b/bdfr/site_downloaders/base_downloader.py @@ -8,7 +8,7 @@ from typing import Optional import requests from praw.models import Submission -from bdfr.exceptions import ResourceNotFound +from bdfr.exceptions import ResourceNotFound, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator @@ -27,7 +27,11 @@ class BaseDownloader(ABC): @staticmethod def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: - res = requests.get(url, cookies=cookies, headers=headers) + try: + res = requests.get(url, cookies=cookies, headers=headers) + except requests.exceptions.RequestException as e: + logger.exception(e) + raise SiteDownloaderError(f'Failed to get page {url}') if res.status_code != 200: raise ResourceNotFound(f'Server responded with {res.status_code} to {url}') return res From 386d5ea41c675569f15ab0100cb52dd9b4777e93 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:47:16 +1000 Subject: [PATCH 03/56] Format file --- tests/conftest.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ce4b681..da02948 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,11 @@ from bdfr.oauth2 import OAuth2TokenManager @pytest.fixture(scope='session') def reddit_instance(): - rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test') + rd = praw.Reddit( + client_id='U-6gk4ZCh3IeNQ', + client_secret='7CZHY6AmKweZME5s50SfDGylaPg', + user_agent='test', + ) return rd @@ -27,8 +31,10 @@ def authenticated_reddit_instance(): if not cfg_parser.has_option('DEFAULT', 'user_token'): pytest.skip('Refresh token must be provided to authenticate with OAuth2') token_manager = OAuth2TokenManager(cfg_parser, test_config_path) - reddit_instance = praw.Reddit(client_id=cfg_parser.get('DEFAULT', 'client_id'), - client_secret=cfg_parser.get('DEFAULT', 'client_secret'), - user_agent=socket.gethostname(), - token_manager=token_manager) + reddit_instance = praw.Reddit( + client_id=cfg_parser.get('DEFAULT', 'client_id'), + client_secret=cfg_parser.get('DEFAULT', 'client_secret'), + user_agent=socket.gethostname(), + token_manager=token_manager, + ) return reddit_instance From 214c883a100163f52326641fa2c53bede106d281 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:48:00 +1000 Subject: [PATCH 04/56] Simplify regex string slightly --- bdfr/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index c24b5cd..c109641 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -207,7 +207,7 @@ class RedditDownloader: @staticmethod def _sanitise_subreddit_name(subreddit: str) -> str: - pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)(?:/)?$') + pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$') match = re.match(pattern, subreddit) if not match: raise errors.BulkDownloaderException(f'Could not find subreddit name in string {subreddit}') From ca495a66777b54daeb28f9aa43509a8c7a656500 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:54:21 +1000 Subject: [PATCH 05/56] Add missing typing declaration --- bdfr/downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index c109641..0df44d0 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -14,7 +14,7 @@ from datetime import datetime from enum 
import Enum, auto from multiprocessing import Pool from pathlib import Path -from typing import Iterator +from typing import Callable, Iterator import appdirs import praw @@ -265,7 +265,7 @@ class RedditDownloader: supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) return [supplied_submissions] - def _determine_sort_function(self): + def _determine_sort_function(self) -> Callable: if self.sort_filter is RedditTypes.SortType.NEW: sort_function = praw.models.Subreddit.new elif self.sort_filter is RedditTypes.SortType.RISING: From 7438543f491b2dbee41126c439c5171b924eeaec Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:54:34 +1000 Subject: [PATCH 06/56] Remove unused variable --- bdfr/downloader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 0df44d0..fce0631 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -225,7 +225,6 @@ class RedditDownloader: def _get_subreddits(self) -> list[praw.models.ListingGenerator]: if self.args.subreddit: out = [] - sort_function = self._determine_sort_function() for reddit in self._split_args_input(self.args.subreddit): try: reddit = self.reddit_instance.subreddit(reddit) From 8cdf926211af0751d57b812548f49d0ff2d363dd Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:58:01 +1000 Subject: [PATCH 07/56] Rename function --- bdfr/site_downloaders/imgur.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 3458a45..33190ee 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -26,12 +26,12 @@ class Imgur(BaseDownloader): if 'album_images' in self.raw_data: images = self.raw_data['album_images'] for image in images['images']: - out.append(self._download_image(image)) + out.append(self._compute_image_url(image)) else: - out.append(self._download_image(self.raw_data)) + out.append(self._compute_image_url(self.raw_data)) return out - def _download_image(self, image: dict) -> Resource: + def _compute_image_url(self, image: dict) -> Resource: image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) return Resource(self.post, image_url) From f5d11107a75c6052757a62f8a6c257c6cba50700 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 21:06:16 +1000 Subject: [PATCH 08/56] Remove unused imports --- bdfr/site_downloaders/gif_delivery_network.py | 2 +- bdfr/site_downloaders/imgur.py | 2 +- bdfr/site_downloaders/redgifs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/gif_delivery_network.py b/bdfr/site_downloaders/gif_delivery_network.py index dbe2cf5..6efbb6f 100644 --- a/bdfr/site_downloaders/gif_delivery_network.py +++ b/bdfr/site_downloaders/gif_delivery_network.py @@ -6,7 +6,7 @@ import json from bs4 import BeautifulSoup from praw.models import Submission -from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError +from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.base_downloader import BaseDownloader diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 33190ee..6ae8a5e 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -7,7 +7,7 @@ from typing import Optional import bs4 from praw.models import Submission -from bdfr.exceptions import 
NotADownloadableLinkError, SiteDownloaderError +from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.base_downloader import BaseDownloader diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 2436d33..3b59818 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -7,7 +7,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError +from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork From 12c040d85da832e99f85a705d11fb0a13921cee5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 21:03:01 +1000 Subject: [PATCH 09/56] Fix reference paths --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- .github/ISSUE_TEMPLATE/site-support-request.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 87364f4..ab4e1ab 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -9,7 +9,7 @@ assignees: '' - [ ] I am reporting a bug. - [ ] I am running the latest version of BDfR -- [ ] I have read the [Opening an issue](README.md#configuration) +- [ ] I have read the [Opening an issue](../../README.md#configuration) ## Description A clear and concise description of what the bug is. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index fbf7f6b..ce9f0b3 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -9,7 +9,7 @@ assignees: '' - [ ] I am requesting a feature. - [ ] I am running the latest version of BDfR -- [ ] I have read the [Opening an issue](README.md#configuration) +- [ ] I have read the [Opening an issue](../../README.md#configuration) ## Description Clearly state the current situation and issues you experience. Then, explain how this feature would solve these issues and make life easier. Also, explain the feature with as many detail as possible. diff --git a/.github/ISSUE_TEMPLATE/site-support-request.md b/.github/ISSUE_TEMPLATE/site-support-request.md index 8524bd8..fd400aa 100644 --- a/.github/ISSUE_TEMPLATE/site-support-request.md +++ b/.github/ISSUE_TEMPLATE/site-support-request.md @@ -9,7 +9,7 @@ assignees: '' - [ ] I am requesting a site support. - [ ] I am running the latest version of BDfR -- [ ] I have read the [Opening an issue](README.md#configuration) +- [ ] I have read the [Opening an issue](../../README.md#configuration) ## Site Provide a URL to domain of the site. 
From a28c2d3c732dcd408d9d4051484fc0b9b0c45b6f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 21:04:55 +1000 Subject: [PATCH 10/56] Add missing default argument --- bdfr/oauth2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 505d5bd..6b27599 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -81,7 +81,7 @@ class OAuth2Authenticator: return client @staticmethod - def send_message(client: socket.socket, message: str): + def send_message(client: socket.socket, message: str = ''): client.send(f'HTTP/1.1 200 OK\r\n\r\n{message}'.encode('utf-8')) client.close() From fbf8a2748ed7520025c0b76c992e19c9ab34be17 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 21:05:21 +1000 Subject: [PATCH 11/56] Fix formatting --- tests/test_integration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index 396025b..a384af6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -11,6 +11,7 @@ from bdfr.__main__ import cli does_test_config_exist = Path('test_config.cfg').exists() + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') From 3bc10ce1aa6ef7f8e8ebeea6cff8938dd7dd62f9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 21:05:49 +1000 Subject: [PATCH 12/56] Fix formatting --- tests/site_downloaders/test_vreddit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/test_vreddit.py b/tests/site_downloaders/test_vreddit.py index 3b663c2..ac83a9e 100644 --- a/tests/site_downloaders/test_vreddit.py +++ b/tests/site_downloaders/test_vreddit.py @@ -10,8 +10,8 @@ from bdfr.site_downloaders.vreddit import VReddit @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id'), ( - ('lu8l8g'), +@pytest.mark.parametrize('test_submission_id', ( + 'lu8l8g', )) def test_find_resources(test_submission_id: str, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From cb41d4749ad8cf3e9036919c8ea6a26c03eff0e5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Apr 2021 12:29:37 +1000 Subject: [PATCH 13/56] Add option to specify logfile location --- bdfr/__main__.py | 1 + bdfr/configuration.py | 1 + bdfr/downloader.py | 7 +++++- tests/test_integration.py | 51 +++++++++++++++++++++++++-------------- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 26759a1..4d78149 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -20,6 +20,7 @@ _common_options = [ click.option('-m', '--multireddit', multiple=True, default=None, type=str), click.option('-L', '--limit', default=None, type=int), click.option('--authenticate', is_flag=True, default=None), + click.option('--log', type=str, default=None), click.option('--submitted', is_flag=True, default=None), click.option('--upvoted', is_flag=True, default=None), click.option('--saved', is_flag=True, default=None), diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 1d9610c..c5c7142 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -17,6 +17,7 @@ class Configuration(Namespace): self.exclude_id_file = [] self.limit: Optional[int] = None self.link: list[str] = [] + self.log: Optional[str] = None self.max_wait_time = None self.multireddit: list[str] = [] self.no_dupes: bool = False diff --git a/bdfr/downloader.py 
b/bdfr/downloader.py index fce0631..48c4234 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -190,7 +190,12 @@ class RedditDownloader: def _create_file_logger(self): main_logger = logging.getLogger() - log_path = Path(self.config_directory, 'log_output.txt') + if self.args.log is None: + log_path = Path(self.config_directory, 'log_output.txt') + else: + log_path = Path(self.args.log).resolve().expanduser() + if not log_path.parent.exists(): + raise errors.BulkDownloaderException(f'Designated location for logfile does not exist') backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3) file_handler = logging.handlers.RotatingFileHandler( log_path, diff --git a/tests/test_integration.py b/tests/test_integration.py index a384af6..327acc4 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -12,6 +12,21 @@ from bdfr.__main__ import cli does_test_config_exist = Path('test_config.cfg').exists() +def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path): + out = [ + 'download', str(tmp_path), + '-v', + '--config', 'test_config.cfg', + '--log', str(Path(tmp_path, 'test_log.txt')), + ] + test_args + return out + + +def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path): + out = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + return out + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @@ -36,7 +51,7 @@ does_test_config_exist = Path('test_config.cfg').exists() )) def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Added submissions from subreddit ' in result.output @@ -54,7 +69,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): )) def test_cli_download_links(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 @@ -70,7 +85,7 @@ def test_cli_download_links(test_args: list[str], tmp_path: Path): )) def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Added submissions from multireddit ' in result.output @@ -84,7 +99,7 @@ def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): )) def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Failed to get submissions for multireddit' in result.output @@ -105,7 +120,7 @@ def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Pa )) def test_cli_download_user_data_good(test_args: list[str], 
tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Downloaded submission ' in result.output @@ -120,7 +135,7 @@ def test_cli_download_user_data_good(test_args: list[str], tmp_path: Path): )) def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'To use "me" as a user, an authenticated Reddit instance must be used' in result.output @@ -135,7 +150,7 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): Path(tmp_path, 'test.txt').touch() runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Calculating hashes for' in result.output @@ -149,7 +164,7 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): )) def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Download filter removed submission' in result.output @@ -164,7 +179,7 @@ def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): )) def test_cli_download_long(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 @@ -178,7 +193,7 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): )) def test_cli_archive_single(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert re.search(r'Writing entry .*? to file in .*? format', result.output) @@ -197,7 +212,7 @@ def test_cli_archive_single(test_args: list[str], tmp_path: Path): )) def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) @@ -211,7 +226,7 @@ def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): )) def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 @@ -226,7 +241,7 @@ def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): )) def test_cli_archive_long(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert re.search(r'Writing entry .*? to file in .*? format', result.output) @@ -243,7 +258,7 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path): )) def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 @@ -258,7 +273,7 @@ def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): )) def test_cli_download_hard_fail(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code != 0 @@ -278,7 +293,7 @@ def test_cli_download_use_default_config(tmp_path: Path): )) def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'in exclusion list' in result.output @@ -294,7 +309,7 @@ def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): )) def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'in skip list' in result.output @@ -310,7 +325,7 @@ def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path): )) def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path): runner = CliRunner() - test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Some files might not be downloaded due to name conflicts' in result.output From 667aa395e58829da22bb53d5e1830e27d2a89909 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Apr 2021 12:50:48 +1000 Subject: [PATCH 14/56] Update README --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 1f119cd..3414bec 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,9 @@ 
The following options are common between both the `archive` and `download` comma - `--config` - If the path to a configuration file is supplied with this option, the BDFR will use the specified config - See [Configuration Files](#configuration) for more details +- `--log` + - This allows one to specify the location of the logfile + - This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below - `--saved` - This option will make the BDFR use the supplied user's saved posts list as a download source - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me` @@ -240,6 +243,16 @@ To this end, the BDFR will sleep for a time before retrying the download, giving The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will wait for 60 seconds, then 120 seconds, and then move on. **Note that this results in a total time of 180 seconds trying the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum wait time may help. However, note that the actual wait times increase exponentially if the resource is not downloaded i.e. specifying a max value of 300 (5 minutes), can make the BDFR pause for 15 minutes on one submission, not 5, in the worst case. +## Multiple Instances + +The BDFR can be run in multiple instances with multiple configurations, either concurrently or consecutively. The use of scripting files makes this easiest, whether PowerShell on Windows operating systems or Bash elsewhere. This allows multiple scenarios to be run with data being scraped from different sources, as any two sets of scenarios might be mutually exclusive, i.e. it is not possible to download any combination of data from a single run of the BDFR. To download from multiple users, for example, multiple runs of the BDFR are required. + +Running these scenarios consecutively is done easily, like any single run. Configuration files that differ may be specified with the `--config` option to switch between tokens, for example. Otherwise, almost all configuration for data sources can be specified per-run through the command line. + +Running scenarios concurrently (at the same time), however, is more complicated. The BDFR will look to a single, static place to put the detailed log files, in a directory with the configuration file specified above. If there are multiple instances, or processes, of the BDFR running at the same time, they will all be trying to write to a single file. On Linux and other UNIX-based operating systems this will succeed, though there is a substantial risk that the logfile will be useless due to garbled and jumbled data. On Windows, however, attempting this will raise an error that crashes the program, as Windows forbids multiple processes from accessing the same file. + +The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine. 
+ ## List of currently supported sources - Direct links (links leading to a file) From db46676dec24133602bc1c32308c9e78947e834a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Apr 2021 12:54:30 +1000 Subject: [PATCH 15/56] Catch error when logfile accessed concurrently --- bdfr/downloader.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 48c4234..5acb7de 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -203,7 +203,13 @@ class RedditDownloader: backupCount=backup_count, ) if log_path.exists(): - file_handler.doRollover() + try: + file_handler.doRollover() + except PermissionError as e: + logger.critical( + 'Cannot rollover logfile, make sure this is the only ' + 'BDFR process or specify alternate logfile location') + raise formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') file_handler.setFormatter(formatter) file_handler.setLevel(0) From e6551bb797c74c41de36909e1b3aa2badb9ee52c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 23 Apr 2021 20:46:28 +1000 Subject: [PATCH 16/56] Return banned users as not existing --- bdfr/downloader.py | 2 +- tests/test_downloader.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 5acb7de..72cc019 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -344,7 +344,7 @@ class RedditDownloader: try: if not user.id: return False - except prawcore.exceptions.NotFound: + except (prawcore.exceptions.NotFound, AttributeError): return False return True diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 0d609ef..189f66a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -458,3 +458,21 @@ def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_ downloader_mock.args.exclude_id_file = [test_file] results = RedditDownloader._read_excluded_ids(downloader_mock) assert results == {'aaaaaa', 'bbbbbb'} + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_redditor_name', 'expected'), ( + ('anthonyhui', True), # Real + ('lhnhfkuhwreolo', False), # Fake + ('Bree-Boo', False), # Banned +)) +def test_check_user_existence( + test_redditor_name: str, + expected: bool, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.reddit_instance = reddit_instance + result = RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) + assert result == expected From 17499baf617850a04430dc613c0a737b44367299 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Apr 2021 15:10:29 +1000 Subject: [PATCH 17/56] Add informative error when testing user existence --- bdfr/downloader.py | 20 +++++++++++------- tests/test_downloader.py | 45 +++++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 72cc019..6b69f8c 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -314,8 +314,10 @@ class RedditDownloader: def _get_user_data(self) -> list[Iterator]: if any([self.args.submitted, self.args.upvoted, self.args.saved]): if self.args.user: - if not self._check_user_existence(self.args.user): - logger.error(f'User {self.args.user} does not exist') + try: + self._check_user_existence(self.args.user) + except errors.BulkDownloaderException as e: + logger.error(e) return [] generators = [] if self.args.submitted: @@ -339,14 +341,16 @@ class RedditDownloader: 
else: return [] - def _check_user_existence(self, name: str) -> bool: + def _check_user_existence(self, name: str): user = self.reddit_instance.redditor(name=name) try: - if not user.id: - return False - except (prawcore.exceptions.NotFound, AttributeError): - return False - return True + if user.id: + return + except prawcore.exceptions.NotFound: + raise errors.BulkDownloaderException(f'Could not find user {name}') + except AttributeError: + if hasattr(user, 'is_suspended'): + raise errors.BulkDownloaderException(f'User {name} is banned') def _create_file_name_formatter(self) -> FileNameFormatter: return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 189f66a..9ec13cf 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -462,17 +462,46 @@ def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_ @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('test_redditor_name', 'expected'), ( - ('anthonyhui', True), # Real - ('lhnhfkuhwreolo', False), # Fake - ('Bree-Boo', False), # Banned +@pytest.mark.parametrize('test_redditor_name', ( + 'Paracortex', + 'crowdstrike', + 'HannibalGoddamnit', )) -def test_check_user_existence( +def test_check_user_existence_good( + test_redditor_name: str, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.reddit_instance = reddit_instance + RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_redditor_name', ( + 'lhnhfkuhwreolo', + 'adlkfmnhglojh', +)) +def test_check_user_existence_nonexistent( test_redditor_name: str, - expected: bool, reddit_instance: praw.Reddit, downloader_mock: MagicMock, ): downloader_mock.reddit_instance = reddit_instance - result = RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) - assert result == expected + with pytest.raises(BulkDownloaderException, match='Could not find'): + RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_redditor_name', ( + 'Bree-Boo', +)) +def test_check_user_existence_banned( + test_redditor_name: str, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.reddit_instance = reddit_instance + with pytest.raises(BulkDownloaderException, match='is banned'): + RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) From 6a205482696eecdf5bd65c50e6c957c1e12f9219 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 10:00:48 +1000 Subject: [PATCH 18/56] Catch additional error --- bdfr/site_downloaders/gif_delivery_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gif_delivery_network.py b/bdfr/site_downloaders/gif_delivery_network.py index 6efbb6f..1127301 100644 --- a/bdfr/site_downloaders/gif_delivery_network.py +++ b/bdfr/site_downloaders/gif_delivery_network.py @@ -30,7 +30,7 @@ class GifDeliveryNetwork(BaseDownloader): try: content = json.loads(content.string) out = content['video']['contentUrl'] - except (json.JSONDecodeError, KeyError, TypeError): + except (json.JSONDecodeError, KeyError, TypeError, AttributeError): raise SiteDownloaderError('Could not find source link') return out From 7fcbf623a0c09dda67e6aa7e3eba48090f4dac28 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 15:17:21 
+1000 Subject: [PATCH 19/56] Catch additional errors in site downloaders --- bdfr/site_downloaders/gfycat.py | 2 +- bdfr/site_downloaders/redgifs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index f140660..f77f05b 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -34,7 +34,7 @@ class Gfycat(GifDeliveryNetwork): try: out = json.loads(content.contents[0])['video']['contentUrl'] - except (IndexError, KeyError) as e: + except (IndexError, KeyError, AttributeError) as e: raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}') except json.JSONDecodeError as e: raise SiteDownloaderError(f'Did not receive valid JSON data: {e}') diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 3b59818..14ab6ea 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -44,7 +44,7 @@ class Redgifs(GifDeliveryNetwork): try: out = json.loads(content.contents[0])['video']['contentUrl'] - except (IndexError, KeyError): + except (IndexError, KeyError, AttributeError): raise SiteDownloaderError('Failed to find JSON data in page') except json.JSONDecodeError as e: raise SiteDownloaderError(f'Received data was not valid JSON: {e}') From e1a4ac063c73af44507a219f78427afdd5328c9c Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Wed, 28 Apr 2021 11:30:26 +0300 Subject: [PATCH 20/56] (bug) redgifs: fix could not read page source --- bdfr/site_downloaders/redgifs.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 14ab6ea..4a478d0 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -27,23 +27,20 @@ class Redgifs(GifDeliveryNetwork): except AttributeError: raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') - url = 'https://redgifs.com/watch/' + redgif_id + url = f'https://api.redgifs.com/v1/gfycats/{redgif_id}' headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \ + 'Chrome/90.0.4430.93 Safari/537.36', } - page = Redgifs.retrieve_url(url, headers=headers) - - soup = BeautifulSoup(page.text, 'html.parser') - content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) + content = Redgifs.retrieve_url(url, headers=headers) if content is None: raise SiteDownloaderError('Could not read the page source') try: - out = json.loads(content.contents[0])['video']['contentUrl'] + out = content.json()["gfyItem"]["mp4Url"] except (IndexError, KeyError, AttributeError): raise SiteDownloaderError('Failed to find JSON data in page') except json.JSONDecodeError as e: From 3c6e9f6ccf0e66c1747f3698b87b272f8b2049c9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 18:37:49 +1000 Subject: [PATCH 21/56] Refactor class --- bdfr/site_downloaders/redgifs.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 4a478d0..cd429e2 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -27,21 +27,19 @@ class Redgifs(GifDeliveryNetwork): except AttributeError: raise SiteDownloaderError(f'Could not extract 
Redgifs ID from {url}') - url = f'https://api.redgifs.com/v1/gfycats/{redgif_id}' - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/90.0.4430.93 Safari/537.36', } - content = Redgifs.retrieve_url(url, headers=headers) + content = Redgifs.retrieve_url(f'https://api.redgifs.com/v1/gfycats/{redgif_id}', headers=headers) if content is None: raise SiteDownloaderError('Could not read the page source') try: - out = content.json()["gfyItem"]["mp4Url"] - except (IndexError, KeyError, AttributeError): + out = json.loads(content.text)['gfyItem']['mp4Url'] + except (KeyError, AttributeError): raise SiteDownloaderError('Failed to find JSON data in page') except json.JSONDecodeError as e: raise SiteDownloaderError(f'Received data was not valid JSON: {e}') From 760e59e1f79f1269249bdf16efce31f1b576cf18 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 18:50:18 +1000 Subject: [PATCH 22/56] Invert inheritance direction --- bdfr/site_downloaders/gfycat.py | 8 +++---- bdfr/site_downloaders/gif_delivery_network.py | 23 ++++--------------- bdfr/site_downloaders/redgifs.py | 7 +++--- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index f77f05b..eb33620 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -10,10 +10,10 @@ from praw.models import Submission from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.site_downloaders.redgifs import Redgifs -class Gfycat(GifDeliveryNetwork): +class Gfycat(Redgifs): def __init__(self, post: Submission): super().__init__(post) @@ -26,8 +26,8 @@ class Gfycat(GifDeliveryNetwork): url = 'https://gfycat.com/' + gfycat_id response = Gfycat.retrieve_url(url) - if 'gifdeliverynetwork' in response.url: - return GifDeliveryNetwork._get_link(url) + if re.search(r'(redgifs|gifdeliverynetwork)', response.url): + return Redgifs._get_link(url) soup = BeautifulSoup(response.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) diff --git a/bdfr/site_downloaders/gif_delivery_network.py b/bdfr/site_downloaders/gif_delivery_network.py index 1127301..26cc1c5 100644 --- a/bdfr/site_downloaders/gif_delivery_network.py +++ b/bdfr/site_downloaders/gif_delivery_network.py @@ -1,36 +1,21 @@ #!/usr/bin/env python3 from typing import Optional -import json -from bs4 import BeautifulSoup from praw.models import Submission -from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.base_downloader import BaseDownloader +from bdfr.site_downloaders.redgifs import Redgifs -class GifDeliveryNetwork(BaseDownloader): +class GifDeliveryNetwork(Redgifs): def __init__(self, post: Submission): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - media_url = self._get_link(self.post.url) - return [Resource(self.post, media_url, '.mp4')] + return super(GifDeliveryNetwork, self).find_resources(authenticator) @staticmethod def _get_link(url: str) -> str: - page = GifDeliveryNetwork.retrieve_url(url) - - soup = 
BeautifulSoup(page.text, 'html.parser') - content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) - - try: - content = json.loads(content.string) - out = content['video']['contentUrl'] - except (json.JSONDecodeError, KeyError, TypeError, AttributeError): - raise SiteDownloaderError('Could not find source link') - - return out + return super(GifDeliveryNetwork, GifDeliveryNetwork)._get_link(url) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index cd429e2..051bc12 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -10,15 +10,16 @@ from praw.models import Submission from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.site_downloaders.base_downloader import BaseDownloader -class Redgifs(GifDeliveryNetwork): +class Redgifs(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return super().find_resources(authenticator) + media_url = self._get_link(self.post.url) + return [Resource(self.post, media_url, '.mp4')] @staticmethod def _get_link(url: str) -> str: From 9931839d1477b749f48e5e57d9ecaac15f77d60c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 18:53:24 +1000 Subject: [PATCH 23/56] Remove gifdeliverynetwork from download factory --- bdfr/site_downloaders/download_factory.py | 5 +---- tests/site_downloaders/test_download_factory.py | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 4bd6225..974e817 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -11,7 +11,6 @@ from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.erome import Erome from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat -from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost @@ -33,11 +32,9 @@ class DownloadFactory: return Gallery elif re.match(r'gfycat\.', sanitised_url): return Gfycat - elif re.match(r'gifdeliverynetwork', sanitised_url): - return GifDeliveryNetwork elif re.match(r'(m\.)?imgur.*', sanitised_url): return Imgur - elif re.match(r'redgifs.com', sanitised_url): + elif re.match(r'(redgifs|gifdeliverynetwork)', sanitised_url): return Redgifs elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 65625b7..99299cb 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -11,7 +11,6 @@ from bdfr.site_downloaders.download_factory import DownloadFactory from bdfr.site_downloaders.erome import Erome from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat -from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost @@ -35,7 
+34,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://www.erome.com/a/NWGw0F09', Erome), ('https://youtube.com/watch?v=Gv8Wz74FjVA', Youtube), ('https://redgifs.com/watch/courageousimpeccablecanvasback', Redgifs), - ('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', GifDeliveryNetwork), + ('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', Redgifs), ('https://youtu.be/DevfjHOhuFc', Youtube), ('https://m.youtube.com/watch?v=kr-FeojxzUM', Youtube), ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), From 39935c58d96d5a440356057c8db28f5eaebd2ef3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 18:58:33 +1000 Subject: [PATCH 24/56] Remove GifDeliveryNetwork module --- bdfr/site_downloaders/gif_delivery_network.py | 21 ----------- .../test_gif_delivery_network.py | 37 ------------------- tests/site_downloaders/test_redgifs.py | 6 +++ 3 files changed, 6 insertions(+), 58 deletions(-) delete mode 100644 bdfr/site_downloaders/gif_delivery_network.py delete mode 100644 tests/site_downloaders/test_gif_delivery_network.py diff --git a/bdfr/site_downloaders/gif_delivery_network.py b/bdfr/site_downloaders/gif_delivery_network.py deleted file mode 100644 index 26cc1c5..0000000 --- a/bdfr/site_downloaders/gif_delivery_network.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 - -from typing import Optional - -from praw.models import Submission - -from bdfr.resource import Resource -from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.redgifs import Redgifs - - -class GifDeliveryNetwork(Redgifs): - def __init__(self, post: Submission): - super().__init__(post) - - def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return super(GifDeliveryNetwork, self).find_resources(authenticator) - - @staticmethod - def _get_link(url: str) -> str: - return super(GifDeliveryNetwork, GifDeliveryNetwork)._get_link(url) diff --git a/tests/site_downloaders/test_gif_delivery_network.py b/tests/site_downloaders/test_gif_delivery_network.py deleted file mode 100644 index 38819c1..0000000 --- a/tests/site_downloaders/test_gif_delivery_network.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -from unittest.mock import Mock - -import pytest - -from bdfr.resource import Resource -from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork - - -@pytest.mark.online -@pytest.mark.parametrize(('test_url', 'expected'), ( - ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', - 'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'), - ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', - 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), -)) -def test_get_link(test_url: str, expected: str): - result = GifDeliveryNetwork._get_link(test_url) - assert result == expected - - -@pytest.mark.online -@pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'), - ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'), -)) -def test_download_resource(test_url: str, expected_hash: str): - mock_submission = Mock() - mock_submission.url = test_url - test_site = GifDeliveryNetwork(mock_submission) - resources = test_site.find_resources() - assert len(resources) == 1 - assert isinstance(resources[0], Resource) - resources[0].download(120) - assert 
resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index a325025..71fc18e 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -15,6 +15,10 @@ from bdfr.site_downloaders.redgifs import Redgifs 'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'), ('https://redgifs.com/watch/springgreendecisivetaruca', 'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'), + ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', + 'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'), + ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', + 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), )) def test_get_link(test_url: str, expected: str): result = Redgifs._get_link(test_url) @@ -25,6 +29,8 @@ def test_get_link(test_url: str, expected: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'), ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), + ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'), + ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'), )) def test_download_resource(test_url: str, expected_hash: str): mock_submission = Mock() From 2c54cd740a95f437d45b6ea6f2e61f9e57626f02 Mon Sep 17 00:00:00 2001 From: Daniel Clowry Date: Thu, 29 Apr 2021 19:05:16 +1000 Subject: [PATCH 25/56] Add Streamable downloader --- bdfr/site_downloaders/streamable.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 bdfr/site_downloaders/streamable.py diff --git a/bdfr/site_downloaders/streamable.py b/bdfr/site_downloaders/streamable.py new file mode 100644 index 0000000..a8c5efe --- /dev/null +++ b/bdfr/site_downloaders/streamable.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import logging +from typing import Optional + +from praw.models import Submission + +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.youtube import Youtube + +logger = logging.getLogger(__name__) + + +class Streamable(Youtube): + def __init__(self, post: Submission): + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + out = super()._download_video({}) + return [out] From e6d2980db30aa052fe1542f6f57a49ca843fade1 Mon Sep 17 00:00:00 2001 From: Daniel Clowry Date: Thu, 29 Apr 2021 19:06:37 +1000 Subject: [PATCH 26/56] Add Streamable to download factory --- bdfr/site_downloaders/download_factory.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 974e817..ae7264c 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -16,6 +16,7 @@ from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube +from bdfr.site_downloaders.streamable import Streamable class DownloadFactory: @@ -42,10 +43,13 @@ class DownloadFactory: return VReddit elif re.match(r'(m\.)?youtu\.?be', sanitised_url): return Youtube + elif re.match(r'streamable\.com', 
sanitised_url): + return Streamable elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct else: - raise NotADownloadableLinkError(f'No downloader module exists for url {url}') + raise NotADownloadableLinkError( + f'No downloader module exists for url {url}') @staticmethod def _sanitise_url(url: str) -> str: From 600a85cbc8861934864a4d271cd5c31d751d1fe7 Mon Sep 17 00:00:00 2001 From: Daniel Clowry Date: Thu, 29 Apr 2021 19:11:27 +1000 Subject: [PATCH 27/56] Add Streamable tests --- tests/site_downloaders/test_streamable.py | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/site_downloaders/test_streamable.py diff --git a/tests/site_downloaders/test_streamable.py b/tests/site_downloaders/test_streamable.py new file mode 100644 index 0000000..ba24039 --- /dev/null +++ b/tests/site_downloaders/test_streamable.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import MagicMock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.streamable import Streamable + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'), + ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf') +)) +def test_find_resources(test_url: str, expected_hash: str): + test_submission = MagicMock() + test_submission.url = test_url + downloader = Streamable(test_submission) + resources = downloader.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download(120) + assert resources[0].hash.hexdigest() == expected_hash From 310cc123df91548138671d8fd37dafd7e42e7c3a Mon Sep 17 00:00:00 2001 From: Daniel Clowry Date: Thu, 29 Apr 2021 20:21:25 +1000 Subject: [PATCH 28/56] Add Streamable to download factory test --- tests/site_downloaders/test_download_factory.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 99299cb..db14f67 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -16,6 +16,7 @@ from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube +from bdfr.site_downloaders.streamable import Streamable @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( @@ -40,6 +41,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), + ('https://streamable.com/dt46y', Streamable) )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) From fe95394b3bb32c183886eb48e24e6375660b38d5 Mon Sep 17 00:00:00 2001 From: Daniel Clowry Date: Thu, 29 Apr 2021 21:38:44 +1000 Subject: [PATCH 29/56] Match import order, update docs --- README.md | 1 + bdfr/site_downloaders/download_factory.py | 2 +- tests/site_downloaders/test_download_factory.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3414bec..b451ae3 100644 --- a/README.md +++ b/README.md @@ -265,6 +265,7 @@ The way to fix this 
is to use the `--log` option to manually specify where the l - Reddit Videos - Redgifs - YouTube + - Streamable ## Contributing diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index ae7264c..8ee4d26 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -14,9 +14,9 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost +from bdfr.site_downloaders.streamable import Streamable from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube -from bdfr.site_downloaders.streamable import Streamable class DownloadFactory: diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index db14f67..aab8540 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -14,9 +14,9 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost +from bdfr.site_downloaders.streamable import Streamable from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube -from bdfr.site_downloaders.streamable import Streamable @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( From db0f90b4e3a8ec85058c5059904732caaee3fb97 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 1 May 2021 13:23:31 +1000 Subject: [PATCH 30/56] Add scripts to extract IDs --- scripts/extract_failed_ids.sh | 17 +++++++++++++++++ scripts/extract_successful_ids.sh | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100755 scripts/extract_failed_ids.sh create mode 100755 scripts/extract_successful_ids.sh diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh new file mode 100755 index 0000000..7e53785 --- /dev/null +++ b/scripts/extract_failed_ids.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -e "$1" ]; then + file="$1" +else + echo 'CANNOT FIND LOG FILE' + exit 1 +fi + +if [ -n "$2" ]; then + output="$2" + echo "Outputting IDs to $output" +else + output="failed.txt" +fi + +grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev >>"$output" diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh new file mode 100755 index 0000000..3b6f7bc --- /dev/null +++ b/scripts/extract_successful_ids.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -e "$1" ]; then + file="$1" +else + echo 'CANNOT FIND LOG FILE' + exit 1 +fi + +if [ -n "$2" ]; then + output="$2" + echo "Outputting IDs to $output" +else + output="successful.txt" +fi + +grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' >> "$output" From b3525490888b716b8718d1ddc77571f4f6bf3e3c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 1 May 2021 13:23:39 +1000 Subject: [PATCH 31/56] Add README for scripts --- scripts/README.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..efaa816 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,44 @@ +# Useful Scripts + +Due to the verboseness of the logs, a great deal of information can be gathered quite easily from the BDFR's 
logfiles. In this folder, there is a selection of scripts that parse these logs, scraping useful bits of information. Since the logfiles are recurring patterns of strings, it is a fairly simple matter to write scripts that utilise tools included on most Linux systems. + +## Extract all Successfully Downloaded IDs + +This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--exclude-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources. + +The script can be used with the following signature: + +```bash +./extract_successful_ids.sh LOGFILE_LOCATION +``` + +By default, if the second argument is not supplied, the script will write the results to `successful.txt`. + +An example of the script being run on a Linux machine is the following: + +```bash +./extract_successful_ids.sh ~/.config/bdfr/log_output.txt +``` + +## Extract all Failed IDs + +[This script](extract_failed_ids.sh) will output a file of all IDs that failed to be downloaded from the logfile in question. This may be used to prevent subsequent runs of the BDFR from re-attempting those submissions if that is desired, potentially increasing performance. +The script can be used with the following signature: + +```bash +./extract_failed_ids.sh LOGFILE_LOCATION +``` + +By default, if the second argument is not supplied, the script will write the results to `failed.txt`. + +An example of the script being run on a Linux machine is the following: + +```bash +./extract_failed_ids.sh ~/.config/bdfr/log_output.txt +``` + +## Converting BDFRv1 Timestamps to BDFRv2 Timestamps + +BDFRv2 uses an internationally recognised and standardised format for timestamps, namely ISO 8601. This is highly recommended due to the nature of using such a widespread and understood standard. However, the BDFRv1 does not use this standard. Due to this, if you've used the old timestamp in filenames or folders, the BDFR will no longer recognise them as the same file and potentially redownload duplicate resources. + +To prevent this, it is recommended that you rename existing files to ISO 8601 standard. This can be done using the [timestamp-converter](https://github.com/Serene-Arc/timestamp-converter) tool made for this purpose. Instructions specifically for the BDFR are available in that project. 
From 14195157de9ef66a8605dff774be10b922401b13 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 28 Apr 2021 12:43:11 +1000 Subject: [PATCH 32/56] Catch errors for banned or private subreddits --- bdfr/downloader.py | 14 ++++++++++++++ tests/test_downloader.py | 12 ++++++++++++ tests/test_integration.py | 2 ++ 3 files changed, 28 insertions(+) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 6b69f8c..9dbafc9 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -239,6 +239,11 @@ class RedditDownloader: for reddit in self._split_args_input(self.args.subreddit): try: reddit = self.reddit_instance.subreddit(reddit) + try: + self._check_subreddit_status(reddit) + except errors.BulkDownloaderException as e: + logger.error(e) + continue if self.args.search: out.append(reddit.search( self.args.search, @@ -460,3 +465,12 @@ class RedditDownloader: for line in file: out.append(line.strip()) return set(out) + + @staticmethod + def _check_subreddit_status(subreddit: praw.models.Subreddit): + try: + assert subreddit.id + except prawcore.NotFound: + raise errors.BulkDownloaderException(f'Source {subreddit.display_name} does not exist or cannot be found') + except prawcore.Forbidden: + raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped') diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 9ec13cf..ef06a77 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -505,3 +505,15 @@ def test_check_user_existence_banned( downloader_mock.reddit_instance = reddit_instance with pytest.raises(BulkDownloaderException, match='is banned'): RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_subreddit_name', 'expected_message'), ( + ('donaldtrump', 'cannot be found'), + ('submitters', 'private and cannot be scraped') +)) +def test_check_subreddit_status(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): + test_subreddit = reddit_instance.subreddit(test_subreddit_name) + with pytest.raises(BulkDownloaderException, match=expected_message): + RedditDownloader._check_subreddit_status(test_subreddit) diff --git a/tests/test_integration.py b/tests/test_integration.py index 327acc4..003a465 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -255,6 +255,8 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path): ['--user', 'sdclhgsolgjeroij', '--submitted', '-L', 10], ['--user', 'me', '--upvoted', '-L', 10], ['--user', 'sdclhgsolgjeroij', '--upvoted', '-L', 10], + ['--subreddit', 'submitters', '-L', 10], # Private subreddit + ['--subreddit', 'donaldtrump', '-L', 10], # Banned subreddit )) def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): runner = CliRunner() From 711f8b0c7680507e2b8352377f6214170c14dc2f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 2 May 2021 13:59:45 +1000 Subject: [PATCH 33/56] Add exception for r/all in subreddit check --- bdfr/downloader.py | 2 ++ tests/test_downloader.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 9dbafc9..3348628 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -468,6 +468,8 @@ class RedditDownloader: @staticmethod def _check_subreddit_status(subreddit: praw.models.Subreddit): + if subreddit.display_name == 'all': + return try: assert subreddit.id except prawcore.NotFound: diff --git 
a/tests/test_downloader.py b/tests/test_downloader.py index ef06a77..0a3418e 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -513,7 +513,20 @@ def test_check_user_existence_banned( ('donaldtrump', 'cannot be found'), ('submitters', 'private and cannot be scraped') )) -def test_check_subreddit_status(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): +def test_check_subreddit_status_bad(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): test_subreddit = reddit_instance.subreddit(test_subreddit_name) with pytest.raises(BulkDownloaderException, match=expected_message): RedditDownloader._check_subreddit_status(test_subreddit) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_subreddit_name', ( + 'Python', + 'Mindustry', + 'TrollXChromosomes', + 'all', +)) +def test_check_subreddit_status_good(test_subreddit_name: str, reddit_instance: praw.Reddit): + test_subreddit = reddit_instance.subreddit(test_subreddit_name) + RedditDownloader._check_subreddit_status(test_subreddit) From eda12e527450c95fea8a8dad34b36025feb3d9a8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 3 May 2021 13:57:06 +1000 Subject: [PATCH 34/56] Make downloadfilter apply itself to Resources --- bdfr/download_filter.py | 15 ++++++++++++--- bdfr/downloader.py | 5 ++--- tests/test_download_filter.py | 22 ++++++++++++++-------- tests/test_integration.py | 3 ++- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/bdfr/download_filter.py b/bdfr/download_filter.py index 37a6ce9..3bbbdec 100644 --- a/bdfr/download_filter.py +++ b/bdfr/download_filter.py @@ -4,6 +4,8 @@ import logging import re +from bdfr.resource import Resource + logger = logging.getLogger(__name__) @@ -21,13 +23,20 @@ class DownloadFilter: else: return True - def _check_extension(self, url: str) -> bool: + def check_resource(self, res: Resource) -> bool: + if not self._check_extension(res.extension): + return False + elif not self._check_domain(res.url): + return False + return True + + def _check_extension(self, resource_extension: str) -> bool: if not self.excluded_extensions: return True combined_extensions = '|'.join(self.excluded_extensions) pattern = re.compile(r'.*({})$'.format(combined_extensions)) - if re.match(pattern, url): - logger.log(9, f'Url "{url}" matched with "{str(pattern)}"') + if re.match(pattern, resource_extension): + logger.log(9, f'Url "{resource_extension}" matched with "{str(pattern)}"') return False else: return True diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 3348628..f0b1977 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -394,9 +394,6 @@ class RedditDownloader: if not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') return - if not self.download_filter.check_url(submission.url): - logger.debug(f'Download filter removed submission {submission.id} with URL {submission.url}') - return try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(submission) @@ -413,6 +410,8 @@ class RedditDownloader: for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): if destination.exists(): logger.debug(f'File {destination} already exists, continuing') + elif not self.download_filter.check_resource(res): + logger.debug(f'Download filter removed {submission.id} with URL {submission.url}') else: try: res.download(self.args.max_wait_time) diff --git 
a/tests/test_download_filter.py b/tests/test_download_filter.py index 3c2adba..ead2b2f 100644 --- a/tests/test_download_filter.py +++ b/tests/test_download_filter.py @@ -1,9 +1,12 @@ #!/usr/bin/env python3 # coding=utf-8 +from unittest.mock import MagicMock + import pytest from bdfr.download_filter import DownloadFilter +from bdfr.resource import Resource @pytest.fixture() @@ -11,13 +14,14 @@ def download_filter() -> DownloadFilter: return DownloadFilter(['mp4', 'mp3'], ['test.com', 'reddit.com']) -@pytest.mark.parametrize(('test_url', 'expected'), ( - ('test.mp4', False), - ('test.avi', True), - ('test.random.mp3', False), +@pytest.mark.parametrize(('test_extension', 'expected'), ( + ('.mp4', False), + ('.avi', True), + ('.random.mp3', False), + ('mp4', False), )) -def test_filter_extension(test_url: str, expected: bool, download_filter: DownloadFilter): - result = download_filter._check_extension(test_url) +def test_filter_extension(test_extension: str, expected: bool, download_filter: DownloadFilter): + result = download_filter._check_extension(test_extension) assert result == expected @@ -42,7 +46,8 @@ def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadF ('http://reddit.com/test.gif', False), )) def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter): - result = download_filter.check_url(test_url) + test_resource = Resource(MagicMock(), test_url) + result = download_filter.check_resource(test_resource) assert result == expected @@ -54,5 +59,6 @@ def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilt )) def test_filter_empty_filter(test_url: str): download_filter = DownloadFilter() - result = download_filter.check_url(test_url) + test_resource = Resource(MagicMock(), test_url) + result = download_filter.check_resource(test_resource) assert result is True diff --git a/tests/test_integration.py b/tests/test_integration.py index 003a465..6345a7c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -161,13 +161,14 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'], + ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip', 'txt'], )) def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): runner = CliRunner() test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert 'Download filter removed submission' in result.output + assert 'Download filter removed ' in result.output @pytest.mark.online From afa3e2548fab04f9befd060f351f89d26d144ed7 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 2 May 2021 13:56:39 +1000 Subject: [PATCH 35/56] Add customisable time formatting --- bdfr/__main__.py | 1 + bdfr/configuration.py | 1 + bdfr/default_config.cfg | 3 +- bdfr/downloader.py | 8 ++++- bdfr/file_name_formatter.py | 33 ++++++++++--------- tests/test_downloader.py | 1 + tests/test_file_name_formatter.py | 53 ++++++++++++++++++++----------- 7 files changed, 63 insertions(+), 37 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 4d78149..29a245c 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -25,6 +25,7 @@ _common_options = [ click.option('--upvoted', is_flag=True, default=None), click.option('--saved', 
is_flag=True, default=None), click.option('--search', default=None, type=str), + click.option('--time-format', type=str, default=None), click.option('-u', '--user', type=str, default=None), click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None), click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', diff --git a/bdfr/configuration.py b/bdfr/configuration.py index c5c7142..9ab9d45 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -33,6 +33,7 @@ class Configuration(Namespace): self.submitted: bool = False self.subreddit: list[str] = [] self.time: str = 'all' + self.time_format = None self.upvoted: bool = False self.user: Optional[str] = None self.verbose: int = 0 diff --git a/bdfr/default_config.cfg b/bdfr/default_config.cfg index 1bcb02b..b8039a9 100644 --- a/bdfr/default_config.cfg +++ b/bdfr/default_config.cfg @@ -3,4 +3,5 @@ client_id = U-6gk4ZCh3IeNQ client_secret = 7CZHY6AmKweZME5s50SfDGylaPg scopes = identity, history, read, save backup_log_count = 3 -max_wait_time = 120 \ No newline at end of file +max_wait_time = 120 +time_format = ISO \ No newline at end of file diff --git a/bdfr/downloader.py b/bdfr/downloader.py index f0b1977..b20fbf5 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -105,6 +105,12 @@ class RedditDownloader: logger.log(9, 'Wrote default download wait time download to config file') self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time') logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') + if self.args.time_format is None: + option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') + if re.match(r'^[ \'\"]*$', option): + option = 'ISO' + logger.debug(f'Setting datetime format string to {option}') + self.args.time_format = option # Update config on disk with open(self.config_location, 'w') as file: self.cfg_parser.write(file) @@ -358,7 +364,7 @@ class RedditDownloader: raise errors.BulkDownloaderException(f'User {name} is banned') def _create_file_name_formatter(self) -> FileNameFormatter: - return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme) + return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme, self.args.time_format) def _create_time_filter(self) -> RedditTypes.TimeType: try: diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index e1d42d7..c6c13c2 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -26,18 +26,18 @@ class FileNameFormatter: 'upvotes', ) - def __init__(self, file_format_string: str, directory_format_string: str): + def __init__(self, file_format_string: str, directory_format_string: str, time_format_string: str): if not self.validate_string(file_format_string): raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string') self.file_format_string = file_format_string self.directory_format_string: list[str] = directory_format_string.split('/') + self.time_format_string = time_format_string - @staticmethod - def _format_name(submission: (Comment, Submission), format_string: str) -> str: + def _format_name(self, submission: (Comment, Submission), format_string: str) -> str: if isinstance(submission, Submission): - attributes = FileNameFormatter._generate_name_dict_from_submission(submission) + attributes = self._generate_name_dict_from_submission(submission) elif isinstance(submission, Comment): - attributes = FileNameFormatter._generate_name_dict_from_comment(submission) + 
attributes = self._generate_name_dict_from_comment(submission) else: raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}') result = format_string @@ -65,8 +65,7 @@ class FileNameFormatter: in_string = in_string.replace(match, converted_match) return in_string - @staticmethod - def _generate_name_dict_from_submission(submission: Submission) -> dict: + def _generate_name_dict_from_submission(self, submission: Submission) -> dict: submission_attributes = { 'title': submission.title, 'subreddit': submission.subreddit.display_name, @@ -74,17 +73,18 @@ class FileNameFormatter: 'postid': submission.id, 'upvotes': submission.score, 'flair': submission.link_flair_text, - 'date': FileNameFormatter._convert_timestamp(submission.created_utc), + 'date': self._convert_timestamp(submission.created_utc), } return submission_attributes - @staticmethod - def _convert_timestamp(timestamp: float) -> str: + def _convert_timestamp(self, timestamp: float) -> str: input_time = datetime.datetime.fromtimestamp(timestamp) - return input_time.isoformat() + if self.time_format_string.upper().strip() == 'ISO': + return input_time.isoformat() + else: + return input_time.strftime(self.time_format_string) - @staticmethod - def _generate_name_dict_from_comment(comment: Comment) -> dict: + def _generate_name_dict_from_comment(self, comment: Comment) -> dict: comment_attributes = { 'title': comment.submission.title, 'subreddit': comment.subreddit.display_name, @@ -92,7 +92,7 @@ class FileNameFormatter: 'postid': comment.id, 'upvotes': comment.score, 'flair': '', - 'date': FileNameFormatter._convert_timestamp(comment.created_utc), + 'date': self._convert_timestamp(comment.created_utc), } return comment_attributes @@ -160,9 +160,8 @@ class FileNameFormatter: result = any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms]) if result: if 'POSTID' not in test_string: - logger.warning( - 'Some files might not be downloaded due to name conflicts as filenames are' - ' not guaranteed to be be unique without {POSTID}') + logger.warning('Some files might not be downloaded due to name conflicts as filenames are' + ' not guaranteed to be be unique without {POSTID}') return True else: return False diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 0a3418e..f1a20fc 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -22,6 +22,7 @@ from bdfr.site_authenticator import SiteAuthenticator @pytest.fixture() def args() -> Configuration: args = Configuration() + args.time_format = 'ISO' return args diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index bcb38d7..b4035dd 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -32,7 +32,7 @@ def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: return reddit_instance.submission(id='lgilgt') -@pytest.mark.parametrize(('format_string', 'expected'), ( +@pytest.mark.parametrize(('test_format_string', 'expected'), ( ('{SUBREDDIT}', 'randomreddit'), ('{REDDITOR}', 'person'), ('{POSTID}', '12345'), @@ -40,10 +40,10 @@ def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: ('{FLAIR}', 'test_flair'), ('{DATE}', '2021-04-21T09:30:00'), ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345'), - ('{RANDOM}', '{RANDOM}'), )) -def test_format_name_mock(format_string: str, expected: str, submission: MagicMock): - result = FileNameFormatter._format_name(submission, format_string) +def 
test_format_name_mock(test_format_string: str, expected: str, submission: MagicMock): + test_formatter = FileNameFormatter(test_format_string, '', 'ISO') + result = test_formatter._format_name(submission, test_format_string) assert result == expected @@ -63,7 +63,7 @@ def test_check_format_string_validity(test_string: str, expected: bool): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('format_string', 'expected'), ( +@pytest.mark.parametrize(('test_format_string', 'expected'), ( ('{SUBREDDIT}', 'Mindustry'), ('{REDDITOR}', 'Gamer_player_boi'), ('{POSTID}', 'lgilgt'), @@ -71,8 +71,9 @@ def test_check_format_string_validity(test_string: str, expected: bool): ('{SUBREDDIT}_{TITLE}', 'Mindustry_Toxopid that is NOT humane >:('), ('{REDDITOR}_{TITLE}_{POSTID}', 'Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt') )) -def test_format_name_real(format_string: str, expected: str, reddit_submission: praw.models.Submission): - result = FileNameFormatter._format_name(reddit_submission, format_string) +def test_format_name_real(test_format_string: str, expected: str, reddit_submission: praw.models.Submission): + test_formatter = FileNameFormatter(test_format_string, '', '') + result = test_formatter._format_name(reddit_submission, test_format_string) assert result == expected @@ -101,7 +102,7 @@ def test_format_full( expected: str, reddit_submission: praw.models.Submission): test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') - test_formatter = FileNameFormatter(format_string_file, format_string_directory) + test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test')) assert str(result) == expected @@ -118,7 +119,7 @@ def test_format_full_conform( format_string_file: str, reddit_submission: praw.models.Submission): test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') - test_formatter = FileNameFormatter(format_string_file, format_string_directory) + test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') test_formatter.format_path(test_resource, Path('test')) @@ -138,7 +139,7 @@ def test_format_full_with_index_suffix( reddit_submission: praw.models.Submission, ): test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') - test_formatter = FileNameFormatter(format_string_file, format_string_directory) + test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test'), index) assert str(result) == expected @@ -152,7 +153,7 @@ def test_format_multiple_resources(): new_mock.source_submission.title = 'test' new_mock.source_submission.__class__ = praw.models.Submission mocks.append(new_mock) - test_formatter = FileNameFormatter('{TITLE}', '') + test_formatter = FileNameFormatter('{TITLE}', '', 'ISO') results = test_formatter.format_resource_paths(mocks, Path('.')) results = set([str(res[0]) for res in results]) assert results == {'test_1.png', 'test_2.png', 'test_3.png', 'test_4.png'} @@ -196,7 +197,7 @@ def test_shorten_filenames(submission: MagicMock, tmp_path: Path): submission.subreddit.display_name = 'test' submission.id = 'BBBBBB' test_resource = Resource(submission, 'www.example.com/empty', '.jpeg') - test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}') + test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}', 'ISO') result = test_formatter.format_path(test_resource, 
tmp_path) result.parent.mkdir(parents=True) result.touch() @@ -237,7 +238,8 @@ def test_strip_emojies(test_string: str, expected: str): )) def test_generate_dict_for_submission(test_submission_id: str, expected: dict, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) - result = FileNameFormatter._generate_name_dict_from_submission(test_submission) + test_formatter = FileNameFormatter('{TITLE}', '', 'ISO') + result = test_formatter._generate_name_dict_from_submission(test_submission) assert all([result.get(key) == expected[key] for key in expected.keys()]) @@ -253,7 +255,8 @@ def test_generate_dict_for_submission(test_submission_id: str, expected: dict, r )) def test_generate_dict_for_comment(test_comment_id: str, expected: dict, reddit_instance: praw.Reddit): test_comment = reddit_instance.comment(id=test_comment_id) - result = FileNameFormatter._generate_name_dict_from_comment(test_comment) + test_formatter = FileNameFormatter('{TITLE}', '', 'ISO') + result = test_formatter._generate_name_dict_from_comment(test_comment) assert all([result.get(key) == expected[key] for key in expected.keys()]) @@ -272,7 +275,7 @@ def test_format_archive_entry_comment( reddit_instance: praw.Reddit, ): test_comment = reddit_instance.comment(id=test_comment_id) - test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme) + test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO') test_entry = Resource(test_comment, '', '.json') result = test_formatter.format_path(test_entry, tmp_path) assert result.name == expected_name @@ -288,7 +291,7 @@ def test_multilevel_folder_scheme( tmp_path: Path, submission: MagicMock, ): - test_formatter = FileNameFormatter('{POSTID}', test_folder_scheme) + test_formatter = FileNameFormatter('{POSTID}', test_folder_scheme, 'ISO') test_resource = MagicMock() test_resource.source_submission = submission test_resource.extension = '.png' @@ -308,7 +311,8 @@ def test_multilevel_folder_scheme( )) def test_preserve_emojis(test_name_string: str, expected: str, submission: MagicMock): submission.title = test_name_string - result = FileNameFormatter._format_name(submission, '{TITLE}') + test_formatter = FileNameFormatter('{TITLE}', '', 'ISO') + result = test_formatter._format_name(submission, '{TITLE}') assert result == expected @@ -328,5 +332,18 @@ def test_convert_unicode_escapes(test_string: str, expected: str): )) def test_convert_timestamp(test_datetime: datetime, expected: str): test_timestamp = test_datetime.timestamp() - result = FileNameFormatter._convert_timestamp(test_timestamp) + test_formatter = FileNameFormatter('{POSTID}', '', 'ISO') + result = test_formatter._convert_timestamp(test_timestamp) + assert result == expected + + +@pytest.mark.parametrize(('test_time_format', 'expected'), ( + ('ISO', '2021-05-02T13:33:00'), + ('%Y_%m', '2021_05'), + ('%Y-%m-%d', '2021-05-02'), +)) +def test_time_string_formats(test_time_format: str, expected: str): + test_time = datetime(2021, 5, 2, 13, 33) + test_formatter = FileNameFormatter('{TITLE}', '', test_time_format) + result = test_formatter._convert_timestamp(test_time.timestamp()) assert result == expected From a86a41e6a55daec927405372806994aad3f853c5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 3 May 2021 09:59:54 +1000 Subject: [PATCH 36/56] Update README --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b451ae3..1a02430 100644 --- a/README.md +++ b/README.md @@ 
-109,6 +109,9 @@ The following options are common between both the `archive` and `download` comma - `week` - `month` - `year` + - `--time-format` + - This specifies the format of the datetime string that replaces `{DATE}` in file and folder naming schemes + - See [Time Formatting Customisation](#time-formatting-customisation) for more details, and the formatting scheme - `-u, --user` - This specifies the user to scrape in concert with other options - When using `--authenticate`, `--user me` can be used to refer to the authenticated user @@ -225,16 +228,26 @@ The logging output for each run of the BDFR will be saved to this directory in t The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied. - - `backup_log_count` - - `max_wait_time` - `client_id` - `client_secret` - `scopes` +The following keys are optional, and defaults will be used if they cannot be found. + + - `backup_log_count` + - `max_wait_time` + - `time_format` + All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default. Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count` however has to do with the log rollover. The logs in the configuration directory can be verbose and for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs past this will overwrite the oldest log file, called "rolling over". If you want more records of past runs, increase this number. +#### Time Formatting Customisation + +The option `time_format` will specify the format of the timestamp that replaces `{DATE}` in filename and folder name schemes. By default, this is the [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format which is highly recommended due to its standardised nature. If you don't **need** to change it, it is recommended that you do not. However, you can specify it to anything required with this option. The `--time-format` option supersedes any specification in the configuration file + +The format can be specified through the [format codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python `datetime` library. + ### Rate Limiting The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases. 
From a8c213627077c3e7464daa6ad92e84659bc54e98 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 2 May 2021 19:48:25 +1000 Subject: [PATCH 37/56] Add fallback downloader --- bdfr/site_downloaders/download_factory.py | 3 ++ .../fallback_downloaders/__init__.py | 0 .../fallback_downloader.py | 15 +++++++ .../youtubedl_fallback.py | 40 +++++++++++++++++++ .../fallback_downloaders/__init__.py | 0 .../youtubedl_fallback.py | 36 +++++++++++++++++ .../site_downloaders/test_download_factory.py | 6 ++- 7 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 bdfr/site_downloaders/fallback_downloaders/__init__.py create mode 100644 bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py create mode 100644 bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py create mode 100644 tests/site_downloaders/fallback_downloaders/__init__.py create mode 100644 tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 8ee4d26..157814f 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -9,6 +9,7 @@ from bdfr.exceptions import NotADownloadableLinkError from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.erome import Erome +from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur @@ -47,6 +48,8 @@ class DownloadFactory: return Streamable elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct + elif YoutubeDlFallback.can_handle_link(sanitised_url): + return YoutubeDlFallback else: raise NotADownloadableLinkError( f'No downloader module exists for url {url}') diff --git a/bdfr/site_downloaders/fallback_downloaders/__init__.py b/bdfr/site_downloaders/fallback_downloaders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py b/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py new file mode 100644 index 0000000..deeb213 --- /dev/null +++ b/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from abc import ABC, abstractmethod + +from bdfr.site_downloaders.base_downloader import BaseDownloader + + +class BaseFallbackDownloader(BaseDownloader, ABC): + + @staticmethod + @abstractmethod + def can_handle_link(url: str) -> bool: + """Returns whether the fallback downloader can download this link""" + raise NotImplementedError diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py new file mode 100644 index 0000000..6e006ec --- /dev/null +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import logging +from typing import Optional + +import youtube_dl +from praw.models import Submission + +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.fallback_downloaders.fallback_downloader import BaseFallbackDownloader +from bdfr.site_downloaders.youtube import Youtube + +logger = logging.getLogger(__name__) + + +class YoutubeDlFallback(BaseFallbackDownloader, 
Youtube): + def __init__(self, post: Submission): + super(YoutubeDlFallback, self).__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + out = super()._download_video({}) + return [out] + + @staticmethod + def can_handle_link(url: str) -> bool: + yt_logger = logging.getLogger('youtube-dl') + yt_logger.setLevel(logging.CRITICAL) + with youtube_dl.YoutubeDL({ + 'logger': yt_logger, + }) as ydl: + try: + result = ydl.extract_info(url, download=False) + if result: + return True + except youtube_dl.DownloadError as e: + logger.exception(e) + return False + return False diff --git a/tests/site_downloaders/fallback_downloaders/__init__.py b/tests/site_downloaders/fallback_downloaders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py new file mode 100644 index 0000000..7f393b6 --- /dev/null +++ b/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +from unittest.mock import MagicMock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', True), + ('https://www.youtube.com/watch?v=P19nvJOmqCc', True), + ('https://www.example.com/test', False), +)) +def test_can_handle_link(test_url: str, expected: bool): + result = YoutubeDlFallback.can_handle_link(test_url) + assert result == expected + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'), + ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'), +)) +def test_find_resources(test_url: str, expected_hash: str): + test_submission = MagicMock() + test_submission.url = test_url + downloader = YoutubeDlFallback(test_submission) + resources = downloader.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index aab8540..005974c 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -9,6 +9,7 @@ from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.download_factory import DownloadFactory from bdfr.site_downloaders.erome import Erome +from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur @@ -19,6 +20,7 @@ from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube +@pytest.mark.online @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( ('https://v.redd.it/9z1dnk3xr5k61', VReddit), ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life' @@ -41,7 +43,9 @@ from 
bdfr.site_downloaders.youtube import Youtube ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), - ('https://streamable.com/dt46y', Streamable) + ('https://streamable.com/dt46y', Streamable), + ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback), + ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback), )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) From fba70dcf188795a0f22270aafb890065d71a50b8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 2 May 2021 19:49:32 +1000 Subject: [PATCH 38/56] Intercept youtube-dl output --- bdfr/site_downloaders/youtube.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index 7b62dc1..482d4bc 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -30,7 +30,10 @@ class Youtube(BaseDownloader): return [out] def _download_video(self, ytdl_options: dict) -> Resource: + yt_logger = logging.getLogger('youtube-dl') + yt_logger.setLevel(logging.CRITICAL) ytdl_options['quiet'] = True + ytdl_options['logger'] = yt_logger with tempfile.TemporaryDirectory() as temp_dir: download_path = Path(temp_dir).resolve() ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' From ab96a3ba97ddfba4b1a4096610d5496a0e2611a8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 2 May 2021 19:53:01 +1000 Subject: [PATCH 39/56] Remove Streamable downloader module --- bdfr/site_downloaders/download_factory.py | 3 --- bdfr/site_downloaders/streamable.py | 21 --------------- .../site_downloaders/test_download_factory.py | 3 +-- tests/site_downloaders/test_streamable.py | 26 ------------------- 4 files changed, 1 insertion(+), 52 deletions(-) delete mode 100644 bdfr/site_downloaders/streamable.py delete mode 100644 tests/site_downloaders/test_streamable.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 157814f..8324cd0 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -15,7 +15,6 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost -from bdfr.site_downloaders.streamable import Streamable from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube @@ -44,8 +43,6 @@ class DownloadFactory: return VReddit elif re.match(r'(m\.)?youtu\.?be', sanitised_url): return Youtube - elif re.match(r'streamable\.com', sanitised_url): - return Streamable elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct elif YoutubeDlFallback.can_handle_link(sanitised_url): diff --git a/bdfr/site_downloaders/streamable.py b/bdfr/site_downloaders/streamable.py deleted file mode 100644 index a8c5efe..0000000 --- a/bdfr/site_downloaders/streamable.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 - -import logging -from typing import Optional - -from praw.models import Submission - -from bdfr.resource import Resource -from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.youtube import Youtube - -logger = logging.getLogger(__name__) - - -class Streamable(Youtube): - def 
__init__(self, post: Submission): - super().__init__(post) - - def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = super()._download_video({}) - return [out] diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 005974c..36580e0 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -15,7 +15,6 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost -from bdfr.site_downloaders.streamable import Streamable from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube @@ -43,7 +42,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), - ('https://streamable.com/dt46y', Streamable), + ('https://streamable.com/dt46y', YoutubeDlFallback), ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback), ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback), )) diff --git a/tests/site_downloaders/test_streamable.py b/tests/site_downloaders/test_streamable.py deleted file mode 100644 index ba24039..0000000 --- a/tests/site_downloaders/test_streamable.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -from unittest.mock import MagicMock - -import pytest - -from bdfr.resource import Resource -from bdfr.site_downloaders.streamable import Streamable - - -@pytest.mark.online -@pytest.mark.slow -@pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'), - ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf') -)) -def test_find_resources(test_url: str, expected_hash: str): - test_submission = MagicMock() - test_submission.url = test_url - downloader = Streamable(test_submission) - resources = downloader.find_resources() - assert len(resources) == 1 - assert isinstance(resources[0], Resource) - resources[0].download(120) - assert resources[0].hash.hexdigest() == expected_hash From c9cde54a723b18798ecf5bb800a5711c8674c58f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 2 May 2021 20:02:34 +1000 Subject: [PATCH 40/56] Remove VReddit downloader module --- bdfr/site_downloaders/download_factory.py | 3 --- bdfr/site_downloaders/vreddit.py | 21 ----------------- .../youtubedl_fallback.py | 1 + .../site_downloaders/test_download_factory.py | 3 +-- tests/site_downloaders/test_vreddit.py | 23 ------------------- 5 files changed, 2 insertions(+), 49 deletions(-) delete mode 100644 bdfr/site_downloaders/vreddit.py delete mode 100644 tests/site_downloaders/test_vreddit.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 8324cd0..7035dc2 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -15,7 +15,6 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost -from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube @@ -39,8 +38,6 @@ class DownloadFactory: 
return Redgifs elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost - elif re.match(r'v\.redd\.it', sanitised_url): - return VReddit elif re.match(r'(m\.)?youtu\.?be', sanitised_url): return Youtube elif re.match(r'i\.redd\.it.*', sanitised_url): diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py deleted file mode 100644 index bff96be..0000000 --- a/bdfr/site_downloaders/vreddit.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 - -import logging -from typing import Optional - -from praw.models import Submission - -from bdfr.resource import Resource -from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.youtube import Youtube - -logger = logging.getLogger(__name__) - - -class VReddit(Youtube): - def __init__(self, post: Submission): - super().__init__(post) - - def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = super()._download_video({}) - return [out] diff --git a/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 7f393b6..f70a91c 100644 --- a/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -25,6 +25,7 @@ def test_can_handle_link(test_url: str, expected: bool): ('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'), + ('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'), )) def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 36580e0..f02e9f7 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -15,13 +15,11 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost -from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( - ('https://v.redd.it/9z1dnk3xr5k61', VReddit), ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life' '_in_anything_but_comfort/', SelfPost), ('https://i.imgur.com/bZx1SJQ.jpg', Direct), @@ -42,6 +40,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), + ('https://v.redd.it/9z1dnk3xr5k61', YoutubeDlFallback), ('https://streamable.com/dt46y', YoutubeDlFallback), ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback), ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback), diff --git a/tests/site_downloaders/test_vreddit.py b/tests/site_downloaders/test_vreddit.py deleted file mode 100644 index ac83a9e..0000000 --- a/tests/site_downloaders/test_vreddit.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -import praw -import pytest - -from bdfr.resource import Resource -from 
bdfr.site_downloaders.vreddit import VReddit - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_submission_id', ( - 'lu8l8g', -)) -def test_find_resources(test_submission_id: str, reddit_instance: praw.Reddit): - test_submission = reddit_instance.submission(id=test_submission_id) - downloader = VReddit(test_submission) - resources = downloader.find_resources() - assert len(resources) == 1 - assert isinstance(resources[0], Resource) - resources[0].download(120) - assert resources[0].content is not None From 66aef3eab93c03a96e3fad20256be1058118c945 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 4 May 2021 11:40:39 +1000 Subject: [PATCH 41/56] Add example summary script --- scripts/README.md | 25 +++++++++++++++++++++++++ scripts/print_summary.sh | 16 ++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100755 scripts/print_summary.sh diff --git a/scripts/README.md b/scripts/README.md index efaa816..4bb098b 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,6 +2,11 @@ Due to the verboseness of the logs, a great deal of information can be gathered quite easily from the BDFR's logfiles. In this folder, there is a selection of scripts that parse these logs, scraping useful bits of information. Since the logfiles are recurring patterns of strings, it is a fairly simple matter to write scripts that utilise tools included on most Linux systems. + - [Script to extract all successfully downloaded IDs](#extract-all-successfully-downloaded-ids) + - [Script to extract all failed download IDs](#extract-all-failed-ids) + - [Timestamp conversion](#converting-bdfrv1-timestamps-to-bdfrv2-timestamps) + - [Printing summary statistics for a run](#printing-summary-statistics) + ## Extract all Successfully Downloaded IDs This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--exclude-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources. @@ -42,3 +47,23 @@ An example of the script being run on a Linux machine is the following: BDFRv2 uses an internationally recognised and standardised format for timestamps, namely ISO 8601. This is highly recommended due to the nature of using such a widespread and understood standard. However, the BDFRv1 does not use this standard. Due to this, if you've used the old timestamp in filenames or folders, the BDFR will no longer recognise them as the same file and potentially redownload duplicate resources. To prevent this, it is recommended that you rename existing files to ISO 8601 standard. This can be done using the [timestamp-converter](https://github.com/Serene-Arc/timestamp-converter) tool made for this purpose. Instructions specifically for the BDFR are available in that project. + +## Printing Summary Statistics + +A simple script has been included to print summary statistics for a run of the BDFR. This is mainly to showcase how easy it is to extract statistics from the logfiles. You can extend this quite easily. For example, you can print how often the Imgur module is used, or how many 404 errors there are in the last run, or which module has caused the most errors. The possibilities really are endless. 
+ +```bash +./print_summary.sh LOGFILE_LOCATION +``` + +This will create an output like the following: + +``` +Downloaded submissions: 250 +Failed downloads: 103 +Files already downloaded: 20073 +Hard linked submissions: 30 +Excluded submissions: 1146 +Files with existing hash skipped: 0 +Submissions from excluded subreddits: 0 +``` diff --git a/scripts/print_summary.sh b/scripts/print_summary.sh new file mode 100755 index 0000000..052ef1e --- /dev/null +++ b/scripts/print_summary.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +if [ -e "$1" ]; then + file="$1" +else + echo 'CANNOT FIND LOG FILE' + exit 1 +fi + +echo "Downloaded submissions: $( grep -c 'Downloaded submission' "$file" )" +echo "Failed downloads: $( grep -c 'failed to download submission' "$file" )" +echo "Files already downloaded: $( grep -c 'already exists, continuing' "$file" )" +echo "Hard linked submissions: $( grep -c 'Hard link made' "$file" )" +echo "Excluded submissions: $( grep -c 'in exclusion list' "$file" )" +echo "Files with existing hash skipped: $( grep -c 'downloaded elsewhere' "$file" )" +echo "Submissions from excluded subreddits: $( grep -c 'in skip list' "$file" )" From 9752ef4b2a904e2aa11f8ea1a30cf82fe1751502 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 4 May 2021 15:45:16 +1000 Subject: [PATCH 42/56] Make filename test OS-agnostic --- tests/test_file_name_formatter.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index b4035dd..b1faf86 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path from typing import Optional from unittest.mock import MagicMock +import platform import praw.models import pytest @@ -27,6 +28,22 @@ def submission() -> MagicMock: return test +def do_test_string_equality(result: str, expected: str) -> bool: + if platform.system() == 'Windows': + expected = FileNameFormatter._format_for_windows(expected) + return expected == result + + +def do_test_path_equality(result: Path, expected: str) -> bool: + if platform.system() == 'Windows': + expected = expected.split('/') + expected = [FileNameFormatter._format_for_windows(part) for part in expected] + expected = Path(*expected) + else: + expected = Path(expected) + return result == expected + + @pytest.fixture(scope='session') def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: return reddit_instance.submission(id='lgilgt') @@ -44,7 +61,7 @@ def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: def test_format_name_mock(test_format_string: str, expected: str, submission: MagicMock): test_formatter = FileNameFormatter(test_format_string, '', 'ISO') result = test_formatter._format_name(submission, test_format_string) - assert result == expected + assert do_test_string_equality(result, expected) @pytest.mark.parametrize(('test_string', 'expected'), ( @@ -74,7 +91,7 @@ def test_check_format_string_validity(test_string: str, expected: bool): def test_format_name_real(test_format_string: str, expected: str, reddit_submission: praw.models.Submission): test_formatter = FileNameFormatter(test_format_string, '', '') result = test_formatter._format_name(reddit_submission, test_format_string) - assert result == expected + assert do_test_string_equality(result, expected) @pytest.mark.online @@ -104,7 +121,7 @@ def test_format_full( test_resource = Resource(reddit_submission, 
'i.reddit.com/blabla.png') test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test')) - assert str(result) == expected + assert do_test_path_equality(result, expected) @pytest.mark.online @@ -141,7 +158,7 @@ def test_format_full_with_index_suffix( test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test'), index) - assert str(result) == expected + assert do_test_path_equality(result, expected) def test_format_multiple_resources(): @@ -278,7 +295,7 @@ def test_format_archive_entry_comment( test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO') test_entry = Resource(test_comment, '', '.json') result = test_formatter.format_path(test_entry, tmp_path) - assert result.name == expected_name + assert do_test_string_equality(result.name, expected_name) @pytest.mark.parametrize(('test_folder_scheme', 'expected'), ( @@ -297,7 +314,7 @@ def test_multilevel_folder_scheme( test_resource.extension = '.png' result = test_formatter.format_path(test_resource, tmp_path) result = result.relative_to(tmp_path) - assert str(result.parent) == expected + assert do_test_path_equality(result.parent, expected) assert len(result.parents) == (len(expected.split('/')) + 1) @@ -313,7 +330,7 @@ def test_preserve_emojis(test_name_string: str, expected: str, submission: Magic submission.title = test_name_string test_formatter = FileNameFormatter('{TITLE}', '', 'ISO') result = test_formatter._format_name(submission, '{TITLE}') - assert result == expected + assert do_test_string_equality(result, expected) @pytest.mark.parametrize(('test_string', 'expected'), ( From 00defe3b8709ce22946b59acbc51a8ede248fee7 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Wed, 5 May 2021 16:35:03 +0300 Subject: [PATCH 43/56] youtubedl_fallback: remove logging the expected exception --- .../site_downloaders/fallback_downloaders/youtubedl_fallback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 6e006ec..477f828 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -35,6 +35,6 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): if result: return True except youtube_dl.DownloadError as e: - logger.exception(e) + logger.debug("Submission cannot be downloaded using the youtube-dl fallback downloader") return False return False From e642ad68d4abe722f74fdc89ea8c4aca17a831c3 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Wed, 5 May 2021 16:56:34 +0300 Subject: [PATCH 44/56] youtubedl_fallback.py: add a fallback exception and log messages --- .../fallback_downloaders/youtubedl_fallback.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 477f828..8facda6 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -37,4 +37,8 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): except youtube_dl.DownloadError as e: logger.debug("Submission cannot be downloaded using the 
youtube-dl fallback downloader") return False + except Exception as e: + logger.error("Youtube-DL quitted unexpectedly.") + logger.exception(e) + return False return False From 77f9a7d52325745be15600bb9a0e3c7b7ac53017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Mon, 3 May 2021 16:23:09 +0300 Subject: [PATCH 45/56] Add ci to dev branch --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bb09493..bf3bfbb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,9 +2,9 @@ name: Python Test on: push: - branches: [ master ] + branches: [ master, development ] pull_request: - branches: [ master ] + branches: [ master, development ] jobs: test: From fc5f4a040576552b9347dec911bbb0a6ace94586 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 6 May 2021 09:59:26 +1000 Subject: [PATCH 46/56] Fix log access issue for archive integration tests --- tests/test_integration.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 6345a7c..5901802 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -23,7 +23,13 @@ def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path): def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path): - out = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + out = [ + 'archive', + str(tmp_path), + '-v', + '--config', 'test_config.cfg', + '--log', str(Path(tmp_path, 'test_log.txt')), + ] + test_args return out From f6d89097f8ac5cfcf9c1908fe24352b56a1b0dd9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 6 May 2021 10:40:22 +1000 Subject: [PATCH 47/56] Consolidate exception block --- .../fallback_downloaders/youtubedl_fallback.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 8facda6..281182a 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -34,11 +34,7 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): result = ydl.extract_info(url, download=False) if result: return True - except youtube_dl.DownloadError as e: - logger.debug("Submission cannot be downloaded using the youtube-dl fallback downloader") - return False except Exception as e: - logger.error("Youtube-DL quitted unexpectedly.") logger.exception(e) return False return False From 283ad164e548ca49433378107c4bcfa4d990c621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Thu, 6 May 2021 12:52:45 +0300 Subject: [PATCH 48/56] __main__.py: fix typo in -f argument --- bdfr/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 29a245c..372c7c3 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -75,7 +75,7 @@ def cli_download(context: click.Context, **_): @cli.command('archive') @_add_common_options @click.option('--all-comments', is_flag=True, default=None) -@click.option('-f,', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None) +@click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None) @click.pass_context def cli_archive(context: click.Context, **_): config = Configuration() From 
4ab1d6d6e03d4ba633d4151fa357490246f36ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Thu, 6 May 2021 13:04:41 +0300 Subject: [PATCH 49/56] test.yml: test on windows and macos (#290) --- .github/workflows/test.yml | 26 ++++++++++++++------------ devscripts/configure.ps1 | 2 ++ devscripts/configure.sh | 2 ++ 3 files changed, 18 insertions(+), 12 deletions(-) create mode 100644 devscripts/configure.ps1 create mode 100755 devscripts/configure.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bf3bfbb..5aa8c61 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,15 +8,17 @@ on: jobs: test: - - runs-on: ubuntu-latest - + runs-on: ${{ matrix.os }} strategy: matrix: + os: [ubuntu-latest, macos-latest] python-version: [3.9] - + ext: [.sh] + include: + - os: windows-latest + python-version: 3.9 + ext: .ps1 steps: - - uses: actions/checkout@v2 - name: Setup Python uses: actions/setup-python@v2 @@ -26,19 +28,19 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip flake8 pytest pytest-cov - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -r requirements.txt - - name: Setup test configuration + - name: Make configuration for tests + env: + REDDIT_TOKEN: ${{ secrets.REDDIT_TEST_TOKEN }} run: | - cp bdfr/default_config.cfg ./test_config.cfg - echo -e "\nuser_token = ${{ secrets.REDDIT_TEST_TOKEN }}" >> ./test_config.cfg + ./devscripts/configure${{ matrix.ext }} - - name: Lint w/ flake8 + - name: Lint with flake8 run: | - # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - - name: Test w/ PyTest + - name: Test with pytest run: | pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered --cov-report html diff --git a/devscripts/configure.ps1 b/devscripts/configure.ps1 new file mode 100644 index 0000000..8ac0ce1 --- /dev/null +++ b/devscripts/configure.ps1 @@ -0,0 +1,2 @@ +copy .\\bdfr\\default_config.cfg .\\test_config.cfg +echo "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg \ No newline at end of file diff --git a/devscripts/configure.sh b/devscripts/configure.sh new file mode 100755 index 0000000..48e7c3e --- /dev/null +++ b/devscripts/configure.sh @@ -0,0 +1,2 @@ +cp ./bdfr/default_config.cfg ./test_config.cfg +echo -e "\nuser_token = $REDDIT_TOKEN" >> ./test_config.cfg \ No newline at end of file From a2e22e894ad0f9e4ed3900007c9fba7f019aaea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Thu, 6 May 2021 16:11:48 +0300 Subject: [PATCH 50/56] Fix xml archiver encoding bug (#349) * test_integration: add archiver tests * archiver.py: fix encoding bug in xml archiver --- bdfr/archiver.py | 2 +- tests/test_integration.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index c6e4299..1945dfe 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -89,7 +89,7 @@ class Archiver(RedditDownloader): def _write_content_to_disk(self, resource: Resource, content: str): file_path = self.file_name_formatter.format_path(resource, self.download_directory) file_path.parent.mkdir(exist_ok=True, parents=True) - with open(file_path, 'w') as file: + with open(file_path, 'w', encoding="utf-8") as file: logger.debug( f'Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}' f' format at {file_path}') diff --git a/tests/test_integration.py 
b/tests/test_integration.py index 5901802..7aec0eb 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -196,7 +196,8 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-l', 'gstd4hk'], - ['-l', 'm2601g'], + ['-l', 'm2601g', '-f', 'yaml'], + ['-l', 'n60t4c', '-f', 'xml'], )) def test_cli_archive_single(test_args: list[str], tmp_path: Path): runner = CliRunner() From db8b1c75471a91909cef031f10f3dbe5c5cc40cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Thu, 6 May 2021 17:16:10 +0300 Subject: [PATCH 51/56] setup.py: add minimum python requirement (#287) --- setup.cfg | 3 +-- setup.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 3b57d7a..5e1a63d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.0.3 +version = 2.0.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc @@ -16,7 +16,6 @@ classifiers = Natural Language :: English Environment :: Console Operating System :: OS Independent -requires_python = >=3.9 platforms = any [files] diff --git a/setup.py b/setup.py index 40c6185..c5518a6 100644 --- a/setup.py +++ b/setup.py @@ -3,4 +3,4 @@ from setuptools import setup -setup(setup_requires=['pbr', 'appdirs'], pbr=True, data_files=[('config', ['bdfr/default_config.cfg'])]) +setup(setup_requires=['pbr', 'appdirs'], pbr=True, data_files=[('config', ['bdfr/default_config.cfg'])], python_requires='>=3.9.0') From 7e70175e4c4c3352d671e970fb3b53a7d2591f18 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 10 May 2021 18:58:42 +1000 Subject: [PATCH 52/56] Change logging message to include submission ID --- bdfr/downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index b20fbf5..1625c8f 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -422,8 +422,8 @@ class RedditDownloader: try: res.download(self.args.max_wait_time) except errors.BulkDownloaderException as e: - logger.error( - f'Failed to download resource {res.url} with downloader {downloader_class.__name__}: {e}') + logger.error(f'Failed to download resource {res.url} in submission {submission.id} ' + f'with downloader {downloader_class.__name__}: {e}') return resource_hash = res.hash.hexdigest() destination.parent.mkdir(parents=True, exist_ok=True) From a65bbd7fdab92384167abb90331b0c6c904fccf4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 10 May 2021 18:58:47 +1000 Subject: [PATCH 53/56] Update script --- scripts/extract_failed_ids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 7e53785..cdf1f21 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -15,3 +15,4 @@ else fi grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev >>"$output" +grep 'Failed to download resource' "$file" | awk '{ print $15 }' >>"$output" From edbd0e90a4f1940f2b06f90e30e8151fb596d1b9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 11 May 2021 09:43:06 +1000 Subject: [PATCH 54/56] Update config paths in README --- README.md | 9 ++++++++- 1 file changed, 8 
insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a02430..a02e436 100644 --- a/README.md +++ b/README.md @@ -214,13 +214,20 @@ It is highly recommended that the file name scheme contain the parameter `{POSTI ## Configuration The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be: + - `C:\Users\<User>\AppData\Local\BDFR\bdfr` +If Python has been installed through the Windows Store, the folder will appear in a different place. Note that the hash included in the file path may change from installation to installation. + + - `C:\Users\<User>\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\Local\BDFR\bdfr` + On Mac OSX, this will be: + - `~/Library/Application Support/bdfr`. Lastly, on a Linux system, this will be: - - `~/.local/share/bdfr` + + - `~/.config/bdfr/` The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report. From c9bd2e76c939a179e289d5321dac815d2681e043 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 11 May 2021 09:51:41 +1000 Subject: [PATCH 55/56] Update README with Arch packages --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a02e436..624cc5f 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,11 @@ If you wish to open an issue, please read [the guide on opening issues](docs/CON python3 -m pip install bdfr ``` +If on Arch Linux or derivative operating systems such as Manjaro, the BDFR can be installed through the AUR. + +- Latest Release: https://aur.archlinux.org/packages/python-bdfr/ +- Latest Development Build: https://aur.archlinux.org/packages/python-bdfr-git/ + If you want to use the source code or make contributions, refer to [CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development) ## Usage From ff36aeb85c4dcc7d193bf3e6e099d10c7ea1ae4d Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Tue, 11 May 2021 13:23:29 +0300 Subject: [PATCH 56/56] Bump the version to v2.1 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 5e1a63d..1bba6b1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.0.0 +version = 2.1.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc