From fca31849502ed933e9abe70c555a58048c9623de Mon Sep 17 00:00:00 2001
From: BlipRanger <1860540+BlipRanger@users.noreply.github.com>
Date: Wed, 12 May 2021 10:47:33 -0400
Subject: [PATCH 001/150] Bind socket to '0.0.0.0' rather than 'localhost' to
 allow for more flexible OAuth connection. (#368)

---
 bdfr/oauth2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py
index 6b27599..bd60c9b 100644
--- a/bdfr/oauth2.py
+++ b/bdfr/oauth2.py
@@ -70,8 +70,8 @@ class OAuth2Authenticator:
     def receive_connection() -> socket.socket:
         server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        server.bind(('localhost', 7634))
-        logger.log(9, 'Server listening on localhost:7634')
+        server.bind(('0.0.0.0', 7634))
+        logger.log(9, 'Server listening on 0.0.0.0:7634')
 
         server.listen(1)
         client = server.accept()[0]

From 8a7d21e159cbb70ee62c557d5a2fb2cebcb885a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Sat, 15 May 2021 12:13:42 +0300
Subject: [PATCH 002/150] Create build artifact (#372)

* Create build artifact

* Update publish.yml
---
 .github/workflows/publish.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 1a679d1..6f15a00 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -27,3 +27,9 @@ jobs:
       run: |
         python setup.py sdist bdist_wheel
         twine upload dist/*
+
+    - name: Upload build artifact
+      uses: actions/upload-artifact@v2
+      with:
+        name: dist
+        path: dist/
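The bind change in PATCH 001 makes the OAuth2 helper listen on all interfaces, so the browser completing the Reddit authorisation no longer has to run on the same machine as BDFR (useful for headless or containerised setups). A minimal sketch of what the remote side of that handshake looks like; the host name is a placeholder, and the query values stand in for the real state and code that Reddit appends to the redirect:

    import socket

    BDFR_HOST = 'bdfr-host.example'  # placeholder for the machine running BDFR

    # receive_connection() above accepts exactly one connection on port 7634,
    # so replaying the redirect request a browser would send completes the flow.
    with socket.create_connection((BDFR_HOST, 7634), timeout=10) as sock:
        sock.sendall(b'GET /?state=dummy&code=dummy HTTP/1.1\r\nHost: bdfr\r\n\r\n')
        print(sock.recv(4096).decode('utf-8', errors='replace'))

Note that binding to 0.0.0.0 exposes the port on every interface, so it matters that the listener only runs for the duration of the handshake, accepting a single connection as the diff shows.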
From aea30d2b44f6099645a70341ba508f853c167e23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Sat, 15 May 2021 13:11:01 +0300
Subject: [PATCH 003/150] Check if REDDIT_TOKEN is set (#376)

---
 devscripts/configure.ps1 | 7 +++++--
 devscripts/configure.sh  | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/devscripts/configure.ps1 b/devscripts/configure.ps1
index 8ac0ce1..b096266 100644
--- a/devscripts/configure.ps1
+++ b/devscripts/configure.ps1
@@ -1,2 +1,5 @@
-copy .\\bdfr\\default_config.cfg .\\test_config.cfg
-echo "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg
\ No newline at end of file
+if (-not ([string]::IsNullOrEmpty($env:REDDIT_TOKEN)))
+{
+    copy .\\bdfr\\default_config.cfg .\\test_config.cfg
+    echo "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg
+}
\ No newline at end of file
diff --git a/devscripts/configure.sh b/devscripts/configure.sh
index 48e7c3e..d9c96df 100755
--- a/devscripts/configure.sh
+++ b/devscripts/configure.sh
@@ -1,2 +1,5 @@
-cp ./bdfr/default_config.cfg ./test_config.cfg
-echo -e "\nuser_token = $REDDIT_TOKEN" >> ./test_config.cfg
\ No newline at end of file
+if [ ! -z "$REDDIT_TOKEN" ]
+then
+    cp ./bdfr/default_config.cfg ./test_config.cfg
+    echo -e "\nuser_token = $REDDIT_TOKEN" >> ./test_config.cfg
+fi
\ No newline at end of file

From c7a5ec43768a64e6e9bfe8cf1438626363ab6426 Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Sat, 15 May 2021 14:35:16 +0300
Subject: [PATCH 004/150] bug(youtube.dl): Fix crash on zero downloads #375

---
 bdfr/site_downloaders/youtube.py       |  9 +++++++--
 tests/site_downloaders/test_youtube.py | 16 +++++++++++++++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py
index 482d4bc..e12fdc1 100644
--- a/bdfr/site_downloaders/youtube.py
+++ b/bdfr/site_downloaders/youtube.py
@@ -8,7 +8,7 @@ from typing import Optional
 import youtube_dl
 from praw.models import Submission
 
-from bdfr.exceptions import SiteDownloaderError
+from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError)
 from bdfr.resource import Resource
 from bdfr.site_authenticator import SiteAuthenticator
 from bdfr.site_downloaders.base_downloader import BaseDownloader
@@ -43,7 +43,12 @@ class Youtube(BaseDownloader):
         except youtube_dl.DownloadError as e:
             raise SiteDownloaderError(f'Youtube download failed: {e}')
 
-        downloaded_file = list(download_path.iterdir())[0]
+        downloaded_file = None
+        downloaded_files = list(download_path.iterdir())
+        if len(downloaded_files) > 0:
+            downloaded_file = downloaded_files[0]
+        else:
+            raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
         extension = downloaded_file.suffix
         with open(downloaded_file, 'rb') as file:
             content = file.read()
diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py
index 95bf1ea..90b451a 100644
--- a/tests/site_downloaders/test_youtube.py
+++ b/tests/site_downloaders/test_youtube.py
@@ -5,6 +5,7 @@ from unittest.mock import MagicMock
 
 import pytest
 
+from bdfr.exceptions import NotADownloadableLinkError
 from bdfr.resource import Resource
 from bdfr.site_downloaders.youtube import Youtube
 
@@ -15,7 +16,7 @@ from bdfr.site_downloaders.youtube import Youtube
     ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'),
     ('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'),
 ))
-def test_find_resources(test_url: str, expected_hash: str):
+def test_find_resources_good(test_url: str, expected_hash: str):
     test_submission = MagicMock()
     test_submission.url = test_url
     downloader = Youtube(test_submission)
@@ -24,3 +25,16 @@ def test_find_resources(test_url: str, expected_hash: str):
     assert isinstance(resources[0], Resource)
     resources[0].download(120)
     assert resources[0].hash.hexdigest() == expected_hash
+
+
+@pytest.mark.online
+@pytest.mark.slow
+@pytest.mark.parametrize(('test_url'), (
+    ('https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman-interview-oj-simpson-goliath-chronicles'),
+))
+def test_find_resources_bad(test_url: str):
+    test_submission = MagicMock()
+    test_submission.url = test_url
+    downloader = Youtube(test_submission)
+    with pytest.raises(NotADownloadableLinkError):
+        downloader.find_resources()
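The crash fixed in PATCH 004 came from unconditionally indexing the download directory's listing, which raises a bare IndexError when youtube-dl produces no file. The guard pattern, reduced to a standalone sketch; the function name is illustrative, and FileNotFoundError stands in for BDFR's own NotADownloadableLinkError:

    from pathlib import Path

    def first_downloaded_file(download_path: Path) -> Path:
        # Old behaviour: list(download_path.iterdir())[0] blew up with an
        # IndexError on an empty directory.
        downloaded_files = list(download_path.iterdir())
        if not downloaded_files:
            # New behaviour: fail with a specific, catchable error instead.
            raise FileNotFoundError(f'No media was downloaded for {download_path}')
        return downloaded_files[0]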
From 17226a4f0beafe96514bd7bf3dff7698ef77ff98 Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Sat, 15 May 2021 14:37:47 +0300
Subject: [PATCH 005/150] refactor(youtube.dl): Remove slow parameter

---
 tests/site_downloaders/test_youtube.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py
index 90b451a..986b0db 100644
--- a/tests/site_downloaders/test_youtube.py
+++ b/tests/site_downloaders/test_youtube.py
@@ -28,7 +28,6 @@ def test_find_resources_good(test_url: str, expected_hash: str):
 
 
 @pytest.mark.online
-@pytest.mark.slow
 @pytest.mark.parametrize(('test_url'), (
     ('https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman-interview-oj-simpson-goliath-chronicles'),
 ))

From ef37712115fc02cb768388580d005402d7fe13ee Mon Sep 17 00:00:00 2001
From: alpbetgam <84060801+alpbetgam@users.noreply.github.com>
Date: Sun, 16 May 2021 14:16:47 +1200
Subject: [PATCH 006/150] Fix error with old gfycat/redgifs urls

---
 bdfr/site_downloaders/gfycat.py       | 1 +
 tests/site_downloaders/test_gfycat.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py
index eb33620..6accaab 100644
--- a/bdfr/site_downloaders/gfycat.py
+++ b/bdfr/site_downloaders/gfycat.py
@@ -27,6 +27,7 @@ class Gfycat(Redgifs):
 
         response = Gfycat.retrieve_url(url)
         if re.search(r'(redgifs|gifdeliverynetwork)', response.url):
+            url = url.lower()  # Fixes error with old gfycat/redgifs links
             return Redgifs._get_link(url)
 
         soup = BeautifulSoup(response.text, 'html.parser')
diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py
index 78c37a3..56aa2d0 100644
--- a/tests/site_downloaders/test_gfycat.py
+++ b/tests/site_downloaders/test_gfycat.py
@@ -14,6 +14,7 @@ from bdfr.site_downloaders.gfycat import Gfycat
     ('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'),
     ('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'),
     ('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'),
+    ('https://gfycat.com/CornyLoathsomeHarrierhawk', 'https://thumbs2.redgifs.com/CornyLoathsomeHarrierhawk.mp4')
 ))
 def test_get_link(test_url: str, expected_url: str):
     result = Gfycat._get_link(test_url)

From 739f97edcce9097cade4f57931da2c3a5e1a41fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Sun, 16 May 2021 13:28:48 +0300
Subject: [PATCH 007/150] Bump the version to 2.1.1

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 1bba6b1..b1345d9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@ description_file = README.md
 description_content_type = text/markdown
 home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
 keywords = reddit, download, archive
-version = 2.1.0
+version = 2.1.1
 author = Ali Parlakci
 author_email = parlakciali@gmail.com
 maintainer = Serene Arc
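Some context for PATCH 006: legacy gfycat IDs are CamelCase, and when such a link redirects to redgifs or gifdeliverynetwork the lookup appears to fail unless the ID is lowercased first, which is all the one-line fix does before handing off to Redgifs._get_link(). A rough standalone sketch of that branch; the function name and the assertion URLs are illustrative:

    import re

    def normalise_redirected_url(original_url: str, final_url: str) -> str:
        # Mirror of the fixed branch: lowercase the old-style CamelCase ID
        # before delegating to the redgifs link resolver.
        if re.search(r'(redgifs|gifdeliverynetwork)', final_url):
            return original_url.lower()
        return original_url

    assert normalise_redirected_url(
        'https://gfycat.com/CornyLoathsomeHarrierhawk',
        'https://www.gifdeliverynetwork.com/cornyloathsomeharrierhawk',
    ) == 'https://gfycat.com/cornyloathsomeharrierhawk'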
From f768a7d61cb478e2d1b8e2e06abf2f1e0134ed6a Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Sun, 16 May 2021 19:47:57 +0300
Subject: [PATCH 008/150] Rename --skip to --skip-format

---
 README.md                 | 2 +-
 bdfr/__main__.py          | 2 +-
 bdfr/configuration.py     | 2 +-
 bdfr/downloader.py        | 2 +-
 tests/test_downloader.py  | 2 +-
 tests/test_integration.py | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index cf5269c..cd1c12b 100644
--- a/README.md
+++ b/README.md
@@ -161,7 +161,7 @@ The following options apply only to the `download` command. This command downloa
 - `--skip-domain`
   - This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded
   - Can be specified multiple times
-- `--skip`
+- `--skip-format`
   - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded
   - Can be specified multiple times
 - `--skip-subreddit`
diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index 372c7c3..03a6e1d 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -53,7 +53,7 @@ def cli():
 @click.option('--max-wait-time', type=int, default=None)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--search-existing', is_flag=True, default=None)
-@click.option('--skip', default=None, multiple=True)
+@click.option('--skip-format', default=None, multiple=True)
 @click.option('--skip-domain', default=None, multiple=True)
 @click.option('--skip-subreddit', default=None, multiple=True)
 @_add_common_options
diff --git a/bdfr/configuration.py b/bdfr/configuration.py
index 9ab9d45..8ca94a0 100644
--- a/bdfr/configuration.py
+++ b/bdfr/configuration.py
@@ -26,7 +26,7 @@ class Configuration(Namespace):
         self.search_existing: bool = False
         self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
         self.folder_scheme: str = '{SUBREDDIT}'
-        self.skip: list[str] = []
+        self.skip_format: list[str] = []
         self.skip_domain: list[str] = []
         self.skip_subreddit: list[str] = []
         self.sort: str = 'hot'
diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 1625c8f..663a9b0 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -379,7 +379,7 @@ class RedditDownloader:
         return RedditTypes.SortType.HOT
 
     def _create_download_filter(self) -> DownloadFilter:
-        return DownloadFilter(self.args.skip, self.args.skip_domain)
+        return DownloadFilter(self.args.skip_format, self.args.skip_domain)
 
     def _create_authenticator(self) -> SiteAuthenticator:
         return SiteAuthenticator(self.cfg_parser)
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index f1a20fc..b6e8f32 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -56,7 +56,7 @@ def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock):
     (['.test'], ['test.com'],),
 ))
 def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock):
-    downloader_mock.args.skip = skip_extensions
+    downloader_mock.args.skip_format = skip_extensions
     downloader_mock.args.skip_domain = skip_domains
     result = RedditDownloader._create_download_filter(downloader_mock)
 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 7aec0eb..d52a527 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -166,8 +166,8 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'],
-    ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip', 'txt'],
+    ['--subreddit', 'tumblr', '-L', '25', '--skip-format', 'png', '--skip-format', 'jpg'],
+    ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip-format', 'txt'],
 ))
 def test_cli_download_download_filters(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
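Only the flag's spelling changes here; the filtering behaviour is untouched. One way to exercise the renamed option without a shell is through click's CliRunner, the same mechanism the integration tests above use; this sketch assumes the `download` command's positional directory argument, and the directory, subreddit, and limit values are arbitrary:

    from click.testing import CliRunner

    from bdfr.__main__ import cli

    runner = CliRunner()
    # Skip any submission that would resolve to a .png or .jpg file.
    result = runner.invoke(cli, [
        'download', './downloads', '--subreddit', 'tumblr', '-L', '5',
        '--skip-format', 'png', '--skip-format', 'jpg',
    ])
    print(result.exit_code, result.output)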
From 200916a150dc7fdddaff9d89c3063c4cc4d84c78 Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Sun, 16 May 2021 19:51:31 +0300
Subject: [PATCH 009/150] Rename --exclude-id(-file) to --skip-id(-file)

---
 README.md                 | 4 ++--
 bdfr/__main__.py          | 4 ++--
 bdfr/configuration.py     | 4 ++--
 bdfr/downloader.py        | 4 ++--
 scripts/README.md         | 2 +-
 tests/test_downloader.py  | 2 +-
 tests/test_integration.py | 2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index cd1c12b..cf7e589 100644
--- a/README.md
+++ b/README.md
@@ -131,10 +131,10 @@ The following options are common between both the `archive` and `download` comma
 
 The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
 
-- `--exclude-id`
+- `--skip-id`
   - This will skip the download of any submission with the ID provided
   - Can be specified multiple times
-- `--exclude-id-file`
+- `--skip-id-file`
   - This will skip the download of any submission with any of the IDs in the files provided
   - Can be specified multiple times
   - Format is one ID per line
diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index 03a6e1d..bafa93c 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -45,14 +45,14 @@ def cli():
 
 @cli.command('download')
-@click.option('--exclude-id', default=None, multiple=True)
-@click.option('--exclude-id-file', default=None, multiple=True)
 @click.option('--file-scheme', default=None, type=str)
 @click.option('--folder-scheme', default=None, type=str)
 @click.option('--make-hard-links', is_flag=True, default=None)
 @click.option('--max-wait-time', type=int, default=None)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--search-existing', is_flag=True, default=None)
+@click.option('--skip-id', default=None, multiple=True)
+@click.option('--skip-id-file', default=None, multiple=True)
 @click.option('--skip-format', default=None, multiple=True)
 @click.option('--skip-domain', default=None, multiple=True)
 @click.option('--skip-subreddit', default=None, multiple=True)
 @_add_common_options
diff --git a/bdfr/configuration.py b/bdfr/configuration.py
index 8ca94a0..8cb8f10 100644
--- a/bdfr/configuration.py
+++ b/bdfr/configuration.py
@@ -13,8 +13,8 @@ class Configuration(Namespace):
         self.authenticate = False
         self.config = None
         self.directory: str = '.'
-        self.exclude_id = []
-        self.exclude_id_file = []
+        self.skip_id = []
+        self.skip_id_file = []
         self.limit: Optional[int] = None
         self.link: list[str] = []
         self.log: Optional[str] = None
diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 663a9b0..6fa37d6 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -460,8 +460,8 @@ class RedditDownloader:
 
     def _read_excluded_ids(self) -> set[str]:
         out = []
-        out.extend(self.args.exclude_id)
-        for id_file in self.args.exclude_id_file:
+        out.extend(self.args.skip_id)
+        for id_file in self.args.skip_id_file:
             id_file = Path(id_file).resolve().expanduser()
             if not id_file.exists():
                 logger.warning(f'ID exclusion file at {id_file} does not exist')
diff --git a/scripts/README.md b/scripts/README.md
index 4bb098b..51e51bb 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -9,7 +9,7 @@ Due to the verboseness of the logs, a great deal of information can be gathered
 
 ## Extract all Successfully Downloaded IDs
 
-This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--exclude-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources.
+This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--skip-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources.
 
 The script can be used with the following signature:
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index b6e8f32..fd56994 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -456,7 +456,7 @@ def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_
 def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
     test_file = tmp_path / 'test.txt'
     test_file.write_text('aaaaaa\nbbbbbb')
-    downloader_mock.args.exclude_id_file = [test_file]
+    downloader_mock.args.skip_id_file = [test_file]
     results = RedditDownloader._read_excluded_ids(downloader_mock)
     assert results == {'aaaaaa', 'bbbbbb'}
 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index d52a527..419464f 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -299,7 +299,7 @@ def test_cli_download_use_default_config(tmp_path: Path):
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
+    ['-l', 'm2601g', '--skip-id', 'm2601g'],
 ))
 def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
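Since `--skip-id-file` reads one submission ID per line, an exclusion list is easy to build or sanity-check by hand. A small sketch mirroring what _read_excluded_ids() does with each supplied file; the file name and IDs are arbitrary:

    from pathlib import Path

    id_file = Path('skip_ids.txt')
    id_file.write_text('m2601g\naaaaaa\n')

    # Strip each line and collect the IDs into a set, as the downloader does.
    excluded = {line.strip() for line in id_file.read_text().splitlines() if line.strip()}
    assert excluded == {'m2601g', 'aaaaaa'}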
From 36886dffe046b0b633e11bdfbc656394b16f72df Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Sun, 16 May 2021 19:56:26 +0300
Subject: [PATCH 010/150] Reorder skip options

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index cf7e589..f89ef23 100644
--- a/README.md
+++ b/README.md
@@ -131,13 +131,6 @@ The following options are common between both the `archive` and `download` comma
 
 The following options apply only to the `download` command.
This command downloa - Sets the scheme for folders - Default is `{SUBREDDIT}` - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details +- `--skip-id` + - This will skip the download of any submission with the ID provided + - Can be specified multiple times +- `--skip-id-file` + - This will skip the download of any submission with any of the IDs in the files provided + - Can be specified multiple times + - Format is one ID per line - `--skip-domain` - This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded - Can be specified multiple times From 71da1556e535a46a33f9ade5fb90119c57a19ce5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 17 May 2021 11:08:16 +1000 Subject: [PATCH 011/150] Change out of date test case --- tests/site_downloaders/test_erome.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 1de9afd..1758eed 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -34,9 +34,6 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): ('https://www.erome.com/a/vqtPuLXh', { '5da2a8d60d87bed279431fdec8e7d72f' }), - ('https://www.erome.com/i/ItASD33e', { - 'b0d73fedc9ce6995c2f2c4fdb6f11eff' - }), ('https://www.erome.com/a/lGrcFxmb', { '0e98f9f527a911dcedde4f846bb5b69f', '25696ae364750a5303fc7d7dc78b35c1', @@ -44,6 +41,12 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): 'a1abf398cfd4ef9cfaf093ceb10c746a', 'bd9e1a4ea5ef0d6ba47fb90e337c2d14' }), + ('https://www.erome.com/a/IK5HADyi', { + '3b2a441ff821c09d9b629271a8b0f19f', + '470343fa67fd2ef9687c4223d278f761', + '7fbbc092939919aa74a710ddd26adc02', + 'c7299a73e019ab635b47c863fe3cd473', + }) )) def test_download_resource(test_url: str, expected_hashes: tuple[str]): # Can't compare hashes for this test, Erome doesn't return the exact same file from request to request so the hash From 70166037631004e74924aa96397504a647ad3944 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 17 May 2021 10:56:44 +1000 Subject: [PATCH 012/150] Refactor out super class RedditConnector --- bdfr/archiver.py | 15 +- bdfr/connector.py | 401 ++++++++++++++++++++++++++++ bdfr/downloader.py | 385 +-------------------------- tests/test_connector.py | 401 ++++++++++++++++++++++++++++ tests/test_downloader.py | 554 +++++++-------------------------------- 5 files changed, 905 insertions(+), 851 deletions(-) create mode 100644 bdfr/connector.py create mode 100644 tests/test_connector.py diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 1945dfe..3e0b907 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -15,13 +15,14 @@ from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry from bdfr.configuration import Configuration from bdfr.downloader import RedditDownloader +from bdfr.connector import RedditConnector from bdfr.exceptions import ArchiverError from bdfr.resource import Resource logger = logging.getLogger(__name__) -class Archiver(RedditDownloader): +class Archiver(RedditConnector): def __init__(self, args: Configuration): super(Archiver, self).__init__(args) @@ -29,9 +30,9 @@ class Archiver(RedditDownloader): for generator in self.reddit_lists: for submission in generator: logger.debug(f'Attempting to archive submission {submission.id}') - self._write_entry(submission) + self.write_entry(submission) - def 
_get_submissions_from_link(self) -> list[list[praw.models.Submission]]: + def get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] for sub_id in self.args.link: if len(sub_id) == 6: @@ -42,10 +43,10 @@ class Archiver(RedditDownloader): supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) return [supplied_submissions] - def _get_user_data(self) -> list[Iterator]: - results = super(Archiver, self)._get_user_data() + def get_user_data(self) -> list[Iterator]: + results = super(Archiver, self).get_user_data() if self.args.user and self.args.all_comments: - sort = self._determine_sort_function() + sort = self.determine_sort_function() logger.debug(f'Retrieving comments of user {self.args.user}') results.append(sort(self.reddit_instance.redditor(self.args.user).comments, limit=self.args.limit)) return results @@ -59,7 +60,7 @@ class Archiver(RedditDownloader): else: raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}') - def _write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)): + def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)): archive_entry = self._pull_lever_entry_factory(praw_item) if self.args.format == 'json': self._write_entry_json(archive_entry) diff --git a/bdfr/connector.py b/bdfr/connector.py new file mode 100644 index 0000000..3dcc118 --- /dev/null +++ b/bdfr/connector.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import configparser +import importlib.resources +import logging +import logging.handlers +import re +import shutil +import socket +from abc import ABCMeta, abstractmethod +from datetime import datetime +from enum import Enum, auto +from pathlib import Path +from typing import Callable, Iterator + +import appdirs +import praw +import praw.exceptions +import praw.models +import prawcore + +from bdfr import exceptions as errors +from bdfr.configuration import Configuration +from bdfr.download_filter import DownloadFilter +from bdfr.file_name_formatter import FileNameFormatter +from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager +from bdfr.site_authenticator import SiteAuthenticator + +logger = logging.getLogger(__name__) + + +class RedditTypes: + class SortType(Enum): + CONTROVERSIAL = auto() + HOT = auto() + NEW = auto() + RELEVENCE = auto() + RISING = auto() + TOP = auto() + + class TimeType(Enum): + ALL = 'all' + DAY = 'day' + HOUR = 'hour' + MONTH = 'month' + WEEK = 'week' + YEAR = 'year' + + +class RedditConnector(metaclass=ABCMeta): + def __init__(self, args: Configuration): + self.args = args + self.config_directories = appdirs.AppDirs('bdfr', 'BDFR') + self.run_time = datetime.now().isoformat() + self._setup_internal_objects() + + self.reddit_lists = self.retrieve_reddit_lists() + + def _setup_internal_objects(self): + self.determine_directories() + self.load_config() + self.create_file_logger() + + self.read_config() + + self.download_filter = self.create_download_filter() + logger.log(9, 'Created download filter') + self.time_filter = self.create_time_filter() + logger.log(9, 'Created time filter') + self.sort_filter = self.create_sort_filter() + logger.log(9, 'Created sort filter') + self.file_name_formatter = self.create_file_name_formatter() + logger.log(9, 'Create file name formatter') + + self.create_reddit_instance() + self.resolve_user_name() + + self.excluded_submission_ids = self.read_excluded_ids() + + self.master_hash_list = {} + self.authenticator = 
self.create_authenticator() + logger.log(9, 'Created site authenticator') + + self.args.skip_subreddit = self.split_args_input(self.args.skip_subreddit) + self.args.skip_subreddit = set([sub.lower() for sub in self.args.skip_subreddit]) + + def read_config(self): + """Read any cfg values that need to be processed""" + if self.args.max_wait_time is None: + if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'): + self.cfg_parser.set('DEFAULT', 'max_wait_time', '120') + logger.log(9, 'Wrote default download wait time download to config file') + self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time') + logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') + if self.args.time_format is None: + option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') + if re.match(r'^[ \'\"]*$', option): + option = 'ISO' + logger.debug(f'Setting datetime format string to {option}') + self.args.time_format = option + # Update config on disk + with open(self.config_location, 'w') as file: + self.cfg_parser.write(file) + + def create_reddit_instance(self): + if self.args.authenticate: + logger.debug('Using authenticated Reddit instance') + if not self.cfg_parser.has_option('DEFAULT', 'user_token'): + logger.log(9, 'Commencing OAuth2 authentication') + scopes = self.cfg_parser.get('DEFAULT', 'scopes') + scopes = OAuth2Authenticator.split_scopes(scopes) + oauth2_authenticator = OAuth2Authenticator( + scopes, + self.cfg_parser.get('DEFAULT', 'client_id'), + self.cfg_parser.get('DEFAULT', 'client_secret'), + ) + token = oauth2_authenticator.retrieve_new_token() + self.cfg_parser['DEFAULT']['user_token'] = token + with open(self.config_location, 'w') as file: + self.cfg_parser.write(file, True) + token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location) + + self.authenticated = True + self.reddit_instance = praw.Reddit( + client_id=self.cfg_parser.get('DEFAULT', 'client_id'), + client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), + user_agent=socket.gethostname(), + token_manager=token_manager, + ) + else: + logger.debug('Using unauthenticated Reddit instance') + self.authenticated = False + self.reddit_instance = praw.Reddit( + client_id=self.cfg_parser.get('DEFAULT', 'client_id'), + client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), + user_agent=socket.gethostname(), + ) + + def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: + master_list = [] + master_list.extend(self.get_subreddits()) + logger.log(9, 'Retrieved subreddits') + master_list.extend(self.get_multireddits()) + logger.log(9, 'Retrieved multireddits') + master_list.extend(self.get_user_data()) + logger.log(9, 'Retrieved user data') + master_list.extend(self.get_submissions_from_link()) + logger.log(9, 'Retrieved submissions for given links') + return master_list + + def determine_directories(self): + self.download_directory = Path(self.args.directory).resolve().expanduser() + self.config_directory = Path(self.config_directories.user_config_dir) + + self.download_directory.mkdir(exist_ok=True, parents=True) + self.config_directory.mkdir(exist_ok=True, parents=True) + + def load_config(self): + self.cfg_parser = configparser.ConfigParser() + if self.args.config: + if (cfg_path := Path(self.args.config)).exists(): + self.cfg_parser.read(cfg_path) + self.config_location = cfg_path + return + possible_paths = [ + Path('./config.cfg'), + Path('./default_config.cfg'), + Path(self.config_directory, 'config.cfg'), + 
Path(self.config_directory, 'default_config.cfg'), + ] + self.config_location = None + for path in possible_paths: + if path.resolve().expanduser().exists(): + self.config_location = path + logger.debug(f'Loading configuration from {path}') + break + if not self.config_location: + self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0] + shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) + if not self.config_location: + raise errors.BulkDownloaderException('Could not find a configuration file to load') + self.cfg_parser.read(self.config_location) + + def create_file_logger(self): + main_logger = logging.getLogger() + if self.args.log is None: + log_path = Path(self.config_directory, 'log_output.txt') + else: + log_path = Path(self.args.log).resolve().expanduser() + if not log_path.parent.exists(): + raise errors.BulkDownloaderException(f'Designated location for logfile does not exist') + backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3) + file_handler = logging.handlers.RotatingFileHandler( + log_path, + mode='a', + backupCount=backup_count, + ) + if log_path.exists(): + try: + file_handler.doRollover() + except PermissionError as e: + logger.critical( + 'Cannot rollover logfile, make sure this is the only ' + 'BDFR process or specify alternate logfile location') + raise + formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') + file_handler.setFormatter(formatter) + file_handler.setLevel(0) + + main_logger.addHandler(file_handler) + + @staticmethod + def sanitise_subreddit_name(subreddit: str) -> str: + pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$') + match = re.match(pattern, subreddit) + if not match: + raise errors.BulkDownloaderException(f'Could not find subreddit name in string {subreddit}') + return match.group(1) + + @staticmethod + def split_args_input(entries: list[str]) -> set[str]: + all_entries = [] + split_pattern = re.compile(r'[,;]\s?') + for entry in entries: + results = re.split(split_pattern, entry) + all_entries.extend([RedditConnector.sanitise_subreddit_name(name) for name in results]) + return set(all_entries) + + def get_subreddits(self) -> list[praw.models.ListingGenerator]: + if self.args.subreddit: + out = [] + for reddit in self.split_args_input(self.args.subreddit): + try: + reddit = self.reddit_instance.subreddit(reddit) + try: + self.check_subreddit_status(reddit) + except errors.BulkDownloaderException as e: + logger.error(e) + continue + if self.args.search: + out.append(reddit.search( + self.args.search, + sort=self.sort_filter.name.lower(), + limit=self.args.limit, + time_filter=self.time_filter.value, + )) + logger.debug( + f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"') + else: + out.append(self.create_filtered_listing_generator(reddit)) + logger.debug(f'Added submissions from subreddit {reddit}') + except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: + logger.error(f'Failed to get submissions for subreddit {reddit}: {e}') + return out + else: + return [] + + def resolve_user_name(self): + if self.args.user == 'me': + if self.authenticated: + self.args.user = self.reddit_instance.user.me().name + logger.log(9, f'Resolved user to {self.args.user}') + else: + self.args.user = None + logger.warning('To use "me" as a user, an authenticated Reddit instance must be used') + + def get_submissions_from_link(self) -> 
list[list[praw.models.Submission]]: + supplied_submissions = [] + for sub_id in self.args.link: + if len(sub_id) == 6: + supplied_submissions.append(self.reddit_instance.submission(id=sub_id)) + else: + supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) + return [supplied_submissions] + + def determine_sort_function(self) -> Callable: + if self.sort_filter is RedditTypes.SortType.NEW: + sort_function = praw.models.Subreddit.new + elif self.sort_filter is RedditTypes.SortType.RISING: + sort_function = praw.models.Subreddit.rising + elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: + sort_function = praw.models.Subreddit.controversial + elif self.sort_filter is RedditTypes.SortType.TOP: + sort_function = praw.models.Subreddit.top + else: + sort_function = praw.models.Subreddit.hot + return sort_function + + def get_multireddits(self) -> list[Iterator]: + if self.args.multireddit: + out = [] + for multi in self.split_args_input(self.args.multireddit): + try: + multi = self.reddit_instance.multireddit(self.args.user, multi) + if not multi.subreddits: + raise errors.BulkDownloaderException + out.append(self.create_filtered_listing_generator(multi)) + logger.debug(f'Added submissions from multireddit {multi}') + except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e: + logger.error(f'Failed to get submissions for multireddit {multi}: {e}') + return out + else: + return [] + + def create_filtered_listing_generator(self, reddit_source) -> Iterator: + sort_function = self.determine_sort_function() + if self.sort_filter in (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL): + return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value) + else: + return sort_function(reddit_source, limit=self.args.limit) + + def get_user_data(self) -> list[Iterator]: + if any([self.args.submitted, self.args.upvoted, self.args.saved]): + if self.args.user: + try: + self.check_user_existence(self.args.user) + except errors.BulkDownloaderException as e: + logger.error(e) + return [] + generators = [] + if self.args.submitted: + logger.debug(f'Retrieving submitted posts of user {self.args.user}') + generators.append(self.create_filtered_listing_generator( + self.reddit_instance.redditor(self.args.user).submissions, + )) + if not self.authenticated and any((self.args.upvoted, self.args.saved)): + logger.warning('Accessing user lists requires authentication') + else: + if self.args.upvoted: + logger.debug(f'Retrieving upvoted posts of user {self.args.user}') + generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit)) + if self.args.saved: + logger.debug(f'Retrieving saved posts of user {self.args.user}') + generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit)) + return generators + else: + logger.warning('A user must be supplied to download user data') + return [] + else: + return [] + + def check_user_existence(self, name: str): + user = self.reddit_instance.redditor(name=name) + try: + if user.id: + return + except prawcore.exceptions.NotFound: + raise errors.BulkDownloaderException(f'Could not find user {name}') + except AttributeError: + if hasattr(user, 'is_suspended'): + raise errors.BulkDownloaderException(f'User {name} is banned') + + def create_file_name_formatter(self) -> FileNameFormatter: + return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme, self.args.time_format) + + def 
create_time_filter(self) -> RedditTypes.TimeType: + try: + return RedditTypes.TimeType[self.args.time.upper()] + except (KeyError, AttributeError): + return RedditTypes.TimeType.ALL + + def create_sort_filter(self) -> RedditTypes.SortType: + try: + return RedditTypes.SortType[self.args.sort.upper()] + except (KeyError, AttributeError): + return RedditTypes.SortType.HOT + + def create_download_filter(self) -> DownloadFilter: + return DownloadFilter(self.args.skip_format, self.args.skip_domain) + + def create_authenticator(self) -> SiteAuthenticator: + return SiteAuthenticator(self.cfg_parser) + + @abstractmethod + def download(self): + pass + + @staticmethod + def check_subreddit_status(subreddit: praw.models.Subreddit): + if subreddit.display_name == 'all': + return + try: + assert subreddit.id + except prawcore.NotFound: + raise errors.BulkDownloaderException(f'Source {subreddit.display_name} does not exist or cannot be found') + except prawcore.Forbidden: + raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped') + + def read_excluded_ids(self) -> set[str]: + out = [] + out.extend(self.args.skip_id) + for id_file in self.args.skip_id_file: + id_file = Path(id_file).resolve().expanduser() + if not id_file.exists(): + logger.warning(f'ID exclusion file at {id_file} does not exist') + continue + with open(id_file, 'r') as file: + for line in file: + out.append(line.strip()) + return set(out) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 6fa37d6..62934a8 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -1,33 +1,19 @@ #!/usr/bin/env python3 # coding=utf-8 -import configparser import hashlib -import importlib.resources -import logging import logging.handlers import os -import re -import shutil -import socket -from datetime import datetime -from enum import Enum, auto from multiprocessing import Pool from pathlib import Path -from typing import Callable, Iterator -import appdirs import praw import praw.exceptions import praw.models -import prawcore -import bdfr.exceptions as errors +from bdfr import exceptions as errors from bdfr.configuration import Configuration -from bdfr.download_filter import DownloadFilter -from bdfr.file_name_formatter import FileNameFormatter -from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager -from bdfr.site_authenticator import SiteAuthenticator +from bdfr.connector import RedditConnector from bdfr.site_downloaders.download_factory import DownloadFactory logger = logging.getLogger(__name__) @@ -39,350 +25,11 @@ def _calc_hash(existing_file: Path): return existing_file, file_hash -class RedditTypes: - class SortType(Enum): - CONTROVERSIAL = auto() - HOT = auto() - NEW = auto() - RELEVENCE = auto() - RISING = auto() - TOP = auto() - - class TimeType(Enum): - ALL = 'all' - DAY = 'day' - HOUR = 'hour' - MONTH = 'month' - WEEK = 'week' - YEAR = 'year' - - -class RedditDownloader: +class RedditDownloader(RedditConnector): def __init__(self, args: Configuration): - self.args = args - self.config_directories = appdirs.AppDirs('bdfr', 'BDFR') - self.run_time = datetime.now().isoformat() - self._setup_internal_objects() - - self.reddit_lists = self._retrieve_reddit_lists() - - def _setup_internal_objects(self): - self._determine_directories() - self._load_config() - self._create_file_logger() - - self._read_config() - - self.download_filter = self._create_download_filter() - logger.log(9, 'Created download filter') - self.time_filter = self._create_time_filter() - logger.log(9, 'Created time 
filter') - self.sort_filter = self._create_sort_filter() - logger.log(9, 'Created sort filter') - self.file_name_formatter = self._create_file_name_formatter() - logger.log(9, 'Create file name formatter') - - self._create_reddit_instance() - self._resolve_user_name() - - self.excluded_submission_ids = self._read_excluded_ids() - + super(RedditDownloader, self).__init__(args) if self.args.search_existing: self.master_hash_list = self.scan_existing_files(self.download_directory) - else: - self.master_hash_list = {} - self.authenticator = self._create_authenticator() - logger.log(9, 'Created site authenticator') - - self.args.skip_subreddit = self._split_args_input(self.args.skip_subreddit) - self.args.skip_subreddit = set([sub.lower() for sub in self.args.skip_subreddit]) - - def _read_config(self): - """Read any cfg values that need to be processed""" - if self.args.max_wait_time is None: - if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'): - self.cfg_parser.set('DEFAULT', 'max_wait_time', '120') - logger.log(9, 'Wrote default download wait time download to config file') - self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time') - logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') - if self.args.time_format is None: - option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') - if re.match(r'^[ \'\"]*$', option): - option = 'ISO' - logger.debug(f'Setting datetime format string to {option}') - self.args.time_format = option - # Update config on disk - with open(self.config_location, 'w') as file: - self.cfg_parser.write(file) - - def _create_reddit_instance(self): - if self.args.authenticate: - logger.debug('Using authenticated Reddit instance') - if not self.cfg_parser.has_option('DEFAULT', 'user_token'): - logger.log(9, 'Commencing OAuth2 authentication') - scopes = self.cfg_parser.get('DEFAULT', 'scopes') - scopes = OAuth2Authenticator.split_scopes(scopes) - oauth2_authenticator = OAuth2Authenticator( - scopes, - self.cfg_parser.get('DEFAULT', 'client_id'), - self.cfg_parser.get('DEFAULT', 'client_secret'), - ) - token = oauth2_authenticator.retrieve_new_token() - self.cfg_parser['DEFAULT']['user_token'] = token - with open(self.config_location, 'w') as file: - self.cfg_parser.write(file, True) - token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location) - - self.authenticated = True - self.reddit_instance = praw.Reddit( - client_id=self.cfg_parser.get('DEFAULT', 'client_id'), - client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), - user_agent=socket.gethostname(), - token_manager=token_manager, - ) - else: - logger.debug('Using unauthenticated Reddit instance') - self.authenticated = False - self.reddit_instance = praw.Reddit( - client_id=self.cfg_parser.get('DEFAULT', 'client_id'), - client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), - user_agent=socket.gethostname(), - ) - - def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: - master_list = [] - master_list.extend(self._get_subreddits()) - logger.log(9, 'Retrieved subreddits') - master_list.extend(self._get_multireddits()) - logger.log(9, 'Retrieved multireddits') - master_list.extend(self._get_user_data()) - logger.log(9, 'Retrieved user data') - master_list.extend(self._get_submissions_from_link()) - logger.log(9, 'Retrieved submissions for given links') - return master_list - - def _determine_directories(self): - self.download_directory = Path(self.args.directory).resolve().expanduser() - 
self.config_directory = Path(self.config_directories.user_config_dir) - - self.download_directory.mkdir(exist_ok=True, parents=True) - self.config_directory.mkdir(exist_ok=True, parents=True) - - def _load_config(self): - self.cfg_parser = configparser.ConfigParser() - if self.args.config: - if (cfg_path := Path(self.args.config)).exists(): - self.cfg_parser.read(cfg_path) - self.config_location = cfg_path - return - possible_paths = [ - Path('./config.cfg'), - Path('./default_config.cfg'), - Path(self.config_directory, 'config.cfg'), - Path(self.config_directory, 'default_config.cfg'), - ] - self.config_location = None - for path in possible_paths: - if path.resolve().expanduser().exists(): - self.config_location = path - logger.debug(f'Loading configuration from {path}') - break - if not self.config_location: - self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0] - shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) - if not self.config_location: - raise errors.BulkDownloaderException('Could not find a configuration file to load') - self.cfg_parser.read(self.config_location) - - def _create_file_logger(self): - main_logger = logging.getLogger() - if self.args.log is None: - log_path = Path(self.config_directory, 'log_output.txt') - else: - log_path = Path(self.args.log).resolve().expanduser() - if not log_path.parent.exists(): - raise errors.BulkDownloaderException(f'Designated location for logfile does not exist') - backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3) - file_handler = logging.handlers.RotatingFileHandler( - log_path, - mode='a', - backupCount=backup_count, - ) - if log_path.exists(): - try: - file_handler.doRollover() - except PermissionError as e: - logger.critical( - 'Cannot rollover logfile, make sure this is the only ' - 'BDFR process or specify alternate logfile location') - raise - formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') - file_handler.setFormatter(formatter) - file_handler.setLevel(0) - - main_logger.addHandler(file_handler) - - @staticmethod - def _sanitise_subreddit_name(subreddit: str) -> str: - pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$') - match = re.match(pattern, subreddit) - if not match: - raise errors.BulkDownloaderException(f'Could not find subreddit name in string {subreddit}') - return match.group(1) - - @staticmethod - def _split_args_input(entries: list[str]) -> set[str]: - all_entries = [] - split_pattern = re.compile(r'[,;]\s?') - for entry in entries: - results = re.split(split_pattern, entry) - all_entries.extend([RedditDownloader._sanitise_subreddit_name(name) for name in results]) - return set(all_entries) - - def _get_subreddits(self) -> list[praw.models.ListingGenerator]: - if self.args.subreddit: - out = [] - for reddit in self._split_args_input(self.args.subreddit): - try: - reddit = self.reddit_instance.subreddit(reddit) - try: - self._check_subreddit_status(reddit) - except errors.BulkDownloaderException as e: - logger.error(e) - continue - if self.args.search: - out.append(reddit.search( - self.args.search, - sort=self.sort_filter.name.lower(), - limit=self.args.limit, - time_filter=self.time_filter.value, - )) - logger.debug( - f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"') - else: - out.append(self._create_filtered_listing_generator(reddit)) - logger.debug(f'Added submissions from subreddit {reddit}') - except 
(errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: - logger.error(f'Failed to get submissions for subreddit {reddit}: {e}') - return out - else: - return [] - - def _resolve_user_name(self): - if self.args.user == 'me': - if self.authenticated: - self.args.user = self.reddit_instance.user.me().name - logger.log(9, f'Resolved user to {self.args.user}') - else: - self.args.user = None - logger.warning('To use "me" as a user, an authenticated Reddit instance must be used') - - def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: - supplied_submissions = [] - for sub_id in self.args.link: - if len(sub_id) == 6: - supplied_submissions.append(self.reddit_instance.submission(id=sub_id)) - else: - supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) - return [supplied_submissions] - - def _determine_sort_function(self) -> Callable: - if self.sort_filter is RedditTypes.SortType.NEW: - sort_function = praw.models.Subreddit.new - elif self.sort_filter is RedditTypes.SortType.RISING: - sort_function = praw.models.Subreddit.rising - elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: - sort_function = praw.models.Subreddit.controversial - elif self.sort_filter is RedditTypes.SortType.TOP: - sort_function = praw.models.Subreddit.top - else: - sort_function = praw.models.Subreddit.hot - return sort_function - - def _get_multireddits(self) -> list[Iterator]: - if self.args.multireddit: - out = [] - for multi in self._split_args_input(self.args.multireddit): - try: - multi = self.reddit_instance.multireddit(self.args.user, multi) - if not multi.subreddits: - raise errors.BulkDownloaderException - out.append(self._create_filtered_listing_generator(multi)) - logger.debug(f'Added submissions from multireddit {multi}') - except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e: - logger.error(f'Failed to get submissions for multireddit {multi}: {e}') - return out - else: - return [] - - def _create_filtered_listing_generator(self, reddit_source) -> Iterator: - sort_function = self._determine_sort_function() - if self.sort_filter in (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL): - return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value) - else: - return sort_function(reddit_source, limit=self.args.limit) - - def _get_user_data(self) -> list[Iterator]: - if any([self.args.submitted, self.args.upvoted, self.args.saved]): - if self.args.user: - try: - self._check_user_existence(self.args.user) - except errors.BulkDownloaderException as e: - logger.error(e) - return [] - generators = [] - if self.args.submitted: - logger.debug(f'Retrieving submitted posts of user {self.args.user}') - generators.append(self._create_filtered_listing_generator( - self.reddit_instance.redditor(self.args.user).submissions, - )) - if not self.authenticated and any((self.args.upvoted, self.args.saved)): - logger.warning('Accessing user lists requires authentication') - else: - if self.args.upvoted: - logger.debug(f'Retrieving upvoted posts of user {self.args.user}') - generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit)) - if self.args.saved: - logger.debug(f'Retrieving saved posts of user {self.args.user}') - generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit)) - return generators - else: - logger.warning('A user must be supplied to download user data') - return [] - else: - return [] - 
- def _check_user_existence(self, name: str): - user = self.reddit_instance.redditor(name=name) - try: - if user.id: - return - except prawcore.exceptions.NotFound: - raise errors.BulkDownloaderException(f'Could not find user {name}') - except AttributeError: - if hasattr(user, 'is_suspended'): - raise errors.BulkDownloaderException(f'User {name} is banned') - - def _create_file_name_formatter(self) -> FileNameFormatter: - return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme, self.args.time_format) - - def _create_time_filter(self) -> RedditTypes.TimeType: - try: - return RedditTypes.TimeType[self.args.time.upper()] - except (KeyError, AttributeError): - return RedditTypes.TimeType.ALL - - def _create_sort_filter(self) -> RedditTypes.SortType: - try: - return RedditTypes.SortType[self.args.sort.upper()] - except (KeyError, AttributeError): - return RedditTypes.SortType.HOT - - def _create_download_filter(self) -> DownloadFilter: - return DownloadFilter(self.args.skip_format, self.args.skip_domain) - - def _create_authenticator(self) -> SiteAuthenticator: - return SiteAuthenticator(self.cfg_parser) def download(self): for generator in self.reddit_lists: @@ -457,27 +104,3 @@ class RedditDownloader: hash_list = {res[1]: res[0] for res in results} return hash_list - - def _read_excluded_ids(self) -> set[str]: - out = [] - out.extend(self.args.skip_id) - for id_file in self.args.skip_id_file: - id_file = Path(id_file).resolve().expanduser() - if not id_file.exists(): - logger.warning(f'ID exclusion file at {id_file} does not exist') - continue - with open(id_file, 'r') as file: - for line in file: - out.append(line.strip()) - return set(out) - - @staticmethod - def _check_subreddit_status(subreddit: praw.models.Subreddit): - if subreddit.display_name == 'all': - return - try: - assert subreddit.id - except prawcore.NotFound: - raise errors.BulkDownloaderException(f'Source {subreddit.display_name} does not exist or cannot be found') - except prawcore.Forbidden: - raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped') diff --git a/tests/test_connector.py b/tests/test_connector.py new file mode 100644 index 0000000..41d9115 --- /dev/null +++ b/tests/test_connector.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from pathlib import Path +from typing import Iterator +from unittest.mock import MagicMock + +import praw +import praw.models +import pytest + +from bdfr.configuration import Configuration +from bdfr.connector import RedditConnector, RedditTypes +from bdfr.download_filter import DownloadFilter +from bdfr.exceptions import BulkDownloaderException +from bdfr.file_name_formatter import FileNameFormatter +from bdfr.site_authenticator import SiteAuthenticator + + +@pytest.fixture() +def args() -> Configuration: + args = Configuration() + args.time_format = 'ISO' + return args + + +@pytest.fixture() +def downloader_mock(args: Configuration): + downloader_mock = MagicMock() + downloader_mock.args = args + downloader_mock.sanitise_subreddit_name = RedditConnector.sanitise_subreddit_name + downloader_mock.split_args_input = RedditConnector.split_args_input + downloader_mock.master_hash_list = {} + return downloader_mock + + +def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]): + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + if result_limit is not None: + assert len(results) == result_limit + return 
results + + +def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): + downloader_mock.args.directory = tmp_path / 'test' + downloader_mock.config_directories.user_config_dir = tmp_path + RedditConnector.determine_directories(downloader_mock) + assert Path(tmp_path / 'test').exists() + + +@pytest.mark.parametrize(('skip_extensions', 'skip_domains'), ( + ([], []), + (['.test'], ['test.com'],), +)) +def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock): + downloader_mock.args.skip_format = skip_extensions + downloader_mock.args.skip_domain = skip_domains + result = RedditConnector.create_download_filter(downloader_mock) + + assert isinstance(result, DownloadFilter) + assert result.excluded_domains == skip_domains + assert result.excluded_extensions == skip_extensions + + +@pytest.mark.parametrize(('test_time', 'expected'), ( + ('all', 'all'), + ('hour', 'hour'), + ('day', 'day'), + ('week', 'week'), + ('random', 'all'), + ('', 'all'), +)) +def test_create_time_filter(test_time: str, expected: str, downloader_mock: MagicMock): + downloader_mock.args.time = test_time + result = RedditConnector.create_time_filter(downloader_mock) + + assert isinstance(result, RedditTypes.TimeType) + assert result.name.lower() == expected + + +@pytest.mark.parametrize(('test_sort', 'expected'), ( + ('', 'hot'), + ('hot', 'hot'), + ('controversial', 'controversial'), + ('new', 'new'), +)) +def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: MagicMock): + downloader_mock.args.sort = test_sort + result = RedditConnector.create_sort_filter(downloader_mock) + + assert isinstance(result, RedditTypes.SortType) + assert result.name.lower() == expected + + +@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( + ('{POSTID}', '{SUBREDDIT}'), + ('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}'), + ('{POSTID}', 'test'), + ('{POSTID}', ''), + ('{POSTID}', '{SUBREDDIT}/{REDDITOR}'), +)) +def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): + downloader_mock.args.file_scheme = test_file_scheme + downloader_mock.args.folder_scheme = test_folder_scheme + result = RedditConnector.create_file_name_formatter(downloader_mock) + + assert isinstance(result, FileNameFormatter) + assert result.file_format_string == test_file_scheme + assert result.directory_format_string == test_folder_scheme.split('/') + + +@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( + ('', ''), + ('', '{SUBREDDIT}'), + ('test', '{SUBREDDIT}'), +)) +def test_create_file_name_formatter_bad(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): + downloader_mock.args.file_scheme = test_file_scheme + downloader_mock.args.folder_scheme = test_folder_scheme + with pytest.raises(BulkDownloaderException): + RedditConnector.create_file_name_formatter(downloader_mock) + + +def test_create_authenticator(downloader_mock: MagicMock): + result = RedditConnector.create_authenticator(downloader_mock) + assert isinstance(result, SiteAuthenticator) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_submission_ids', ( + ('lvpf4l',), + ('lvpf4l', 'lvqnsn'), + ('lvpf4l', 'lvqnsn', 'lvl9kd'), +)) +def test_get_submissions_from_link( + test_submission_ids: list[str], + reddit_instance: praw.Reddit, + downloader_mock: MagicMock): + downloader_mock.args.link = test_submission_ids + downloader_mock.reddit_instance = reddit_instance + results = 
RedditConnector.get_submissions_from_link(downloader_mock) + assert all([isinstance(sub, praw.models.Submission) for res in results for sub in res]) + assert len(results[0]) == len(test_submission_ids) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_subreddits', 'limit', 'sort_type', 'time_filter', 'max_expected_len'), ( + (('Futurology',), 10, 'hot', 'all', 10), + (('Futurology', 'Mindustry, Python'), 10, 'hot', 'all', 30), + (('Futurology',), 20, 'hot', 'all', 20), + (('Futurology', 'Python'), 10, 'hot', 'all', 20), + (('Futurology',), 100, 'hot', 'all', 100), + (('Futurology',), 0, 'hot', 'all', 0), + (('Futurology',), 10, 'top', 'all', 10), + (('Futurology',), 10, 'top', 'week', 10), + (('Futurology',), 10, 'hot', 'week', 10), +)) +def test_get_subreddit_normal( + test_subreddits: list[str], + limit: int, + sort_type: str, + time_filter: str, + max_expected_len: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, +): + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.args.limit = limit + downloader_mock.args.sort = sort_type + downloader_mock.args.subreddit = test_subreddits + downloader_mock.reddit_instance = reddit_instance + downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock) + results = RedditConnector.get_subreddits(downloader_mock) + test_subreddits = downloader_mock._split_args_input(test_subreddits) + results = [sub for res1 in results for sub in res1] + assert all([isinstance(res1, praw.models.Submission) for res1 in results]) + assert all([res.subreddit.display_name in test_subreddits for res in results]) + assert len(results) <= max_expected_len + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), ( + (('Python',), 'scraper', 10, 'all', 10), + (('Python',), '', 10, 'all', 10), + (('Python',), 'djsdsgewef', 10, 'all', 0), + (('Python',), 'scraper', 10, 'year', 10), + (('Python',), 'scraper', 10, 'hour', 1), +)) +def test_get_subreddit_search( + test_subreddits: list[str], + search_term: str, + time_filter: str, + limit: int, + max_expected_len: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, +): + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.args.limit = limit + downloader_mock.args.search = search_term + downloader_mock.args.subreddit = test_subreddits + downloader_mock.reddit_instance = reddit_instance + downloader_mock.sort_filter = RedditTypes.SortType.HOT + downloader_mock.args.time = time_filter + downloader_mock.time_filter = RedditConnector.create_time_filter(downloader_mock) + results = RedditConnector.get_subreddits(downloader_mock) + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + assert all([res.subreddit.display_name in test_subreddits for res in results]) + assert len(results) <= max_expected_len + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_user', 'test_multireddits', 'limit'), ( + ('helen_darten', ('cuteanimalpics',), 10), + ('korfor', ('chess',), 100), +)) +# Good sources at https://www.reddit.com/r/multihub/ +def test_get_multireddits_public( + test_user: str, + test_multireddits: list[str], + limit: int, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot + 
downloader_mock.sort_filter = RedditTypes.SortType.HOT + downloader_mock.args.limit = limit + downloader_mock.args.multireddit = test_multireddits + downloader_mock.args.user = test_user + downloader_mock.reddit_instance = reddit_instance + downloader_mock.create_filtered_listing_generator.return_value = \ + RedditConnector.create_filtered_listing_generator( + downloader_mock, + reddit_instance.multireddit(test_user, test_multireddits[0]), + ) + results = RedditConnector.get_multireddits(downloader_mock) + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + assert len(results) == limit + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_user', 'limit'), ( + ('danigirl3694', 10), + ('danigirl3694', 50), + ('CapitanHam', None), +)) +def test_get_user_submissions(test_user: str, limit: int, downloader_mock: MagicMock, reddit_instance: praw.Reddit): + downloader_mock.args.limit = limit + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = RedditTypes.SortType.HOT + downloader_mock.args.submitted = True + downloader_mock.args.user = test_user + downloader_mock.authenticated = False + downloader_mock.reddit_instance = reddit_instance + downloader_mock.create_filtered_listing_generator.return_value = \ + RedditConnector.create_filtered_listing_generator( + downloader_mock, + reddit_instance.redditor(test_user).submissions, + ) + results = RedditConnector.get_user_data(downloader_mock) + results = assert_all_results_are_submissions(limit, results) + assert all([res.author.name == test_user for res in results]) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.parametrize('test_flag', ( + 'upvoted', + 'saved', +)) +def test_get_user_authenticated_lists( + test_flag: str, + downloader_mock: MagicMock, + authenticated_reddit_instance: praw.Reddit, +): + downloader_mock.args.__dict__[test_flag] = True + downloader_mock.reddit_instance = authenticated_reddit_instance + downloader_mock.args.user = 'me' + downloader_mock.args.limit = 10 + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = RedditTypes.SortType.HOT + RedditConnector.resolve_user_name(downloader_mock) + results = RedditConnector.get_user_data(downloader_mock) + assert_all_results_are_submissions(10, results) + + +@pytest.mark.parametrize(('test_name', 'expected'), ( + ('Mindustry', 'Mindustry'), + ('Futurology', 'Futurology'), + ('r/Mindustry', 'Mindustry'), + ('TrollXChromosomes', 'TrollXChromosomes'), + ('r/TrollXChromosomes', 'TrollXChromosomes'), + ('https://www.reddit.com/r/TrollXChromosomes/', 'TrollXChromosomes'), + ('https://www.reddit.com/r/TrollXChromosomes', 'TrollXChromosomes'), + ('https://www.reddit.com/r/Futurology/', 'Futurology'), + ('https://www.reddit.com/r/Futurology', 'Futurology'), +)) +def test_sanitise_subreddit_name(test_name: str, expected: str): + result = RedditConnector.sanitise_subreddit_name(test_name) + assert result == expected + + +@pytest.mark.parametrize(('test_subreddit_entries', 'expected'), ( + (['test1', 'test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1,test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1, test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1; test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'}) +)) +def 
test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]): + results = RedditConnector.split_args_input(test_subreddit_entries) + assert results == expected + + +def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path): + test_file = tmp_path / 'test.txt' + test_file.write_text('aaaaaa\nbbbbbb') + downloader_mock.args.skip_id_file = [test_file] + results = RedditConnector.read_excluded_ids(downloader_mock) + assert results == {'aaaaaa', 'bbbbbb'} + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_redditor_name', ( + 'Paracortex', + 'crowdstrike', + 'HannibalGoddamnit', +)) +def test_check_user_existence_good( + test_redditor_name: str, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.reddit_instance = reddit_instance + RedditConnector.check_user_existence(downloader_mock, test_redditor_name) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_redditor_name', ( + 'lhnhfkuhwreolo', + 'adlkfmnhglojh', +)) +def test_check_user_existence_nonexistent( + test_redditor_name: str, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.reddit_instance = reddit_instance + with pytest.raises(BulkDownloaderException, match='Could not find'): + RedditConnector.check_user_existence(downloader_mock, test_redditor_name) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_redditor_name', ( + 'Bree-Boo', +)) +def test_check_user_existence_banned( + test_redditor_name: str, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock, +): + downloader_mock.reddit_instance = reddit_instance + with pytest.raises(BulkDownloaderException, match='is banned'): + RedditConnector.check_user_existence(downloader_mock, test_redditor_name) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_subreddit_name', 'expected_message'), ( + ('donaldtrump', 'cannot be found'), + ('submitters', 'private and cannot be scraped') +)) +def test_check_subreddit_status_bad(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): + test_subreddit = reddit_instance.subreddit(test_subreddit_name) + with pytest.raises(BulkDownloaderException, match=expected_message): + RedditConnector.check_subreddit_status(test_subreddit) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_subreddit_name', ( + 'Python', + 'Mindustry', + 'TrollXChromosomes', + 'all', +)) +def test_check_subreddit_status_good(test_subreddit_name: str, reddit_instance: praw.Reddit): + test_subreddit = reddit_instance.subreddit(test_subreddit_name) + RedditConnector.check_subreddit_status(test_subreddit) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index fd56994..ee43625 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -3,20 +3,15 @@ import re from pathlib import Path -from typing import Iterator from unittest.mock import MagicMock -import praw import praw.models import pytest from bdfr.__main__ import setup_logging from bdfr.configuration import Configuration -from bdfr.download_filter import DownloadFilter -from bdfr.downloader import RedditDownloader, RedditTypes -from bdfr.exceptions import BulkDownloaderException -from bdfr.file_name_formatter import FileNameFormatter -from bdfr.site_authenticator import SiteAuthenticator +from bdfr.connector import RedditConnector +from bdfr.downloader import RedditDownloader @pytest.fixture() @@ -30,411 +25,12 @@ def 
args() -> Configuration: def downloader_mock(args: Configuration): downloader_mock = MagicMock() downloader_mock.args = args - downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name - downloader_mock._split_args_input = RedditDownloader._split_args_input + downloader_mock._sanitise_subreddit_name = RedditConnector.sanitise_subreddit_name + downloader_mock._split_args_input = RedditConnector.split_args_input downloader_mock.master_hash_list = {} return downloader_mock -def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]): - results = [sub for res in results for sub in res] - assert all([isinstance(res, praw.models.Submission) for res in results]) - if result_limit is not None: - assert len(results) == result_limit - return results - - -def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): - downloader_mock.args.directory = tmp_path / 'test' - downloader_mock.config_directories.user_config_dir = tmp_path - RedditDownloader._determine_directories(downloader_mock) - assert Path(tmp_path / 'test').exists() - - -@pytest.mark.parametrize(('skip_extensions', 'skip_domains'), ( - ([], []), - (['.test'], ['test.com'],), -)) -def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock): - downloader_mock.args.skip_format = skip_extensions - downloader_mock.args.skip_domain = skip_domains - result = RedditDownloader._create_download_filter(downloader_mock) - - assert isinstance(result, DownloadFilter) - assert result.excluded_domains == skip_domains - assert result.excluded_extensions == skip_extensions - - -@pytest.mark.parametrize(('test_time', 'expected'), ( - ('all', 'all'), - ('hour', 'hour'), - ('day', 'day'), - ('week', 'week'), - ('random', 'all'), - ('', 'all'), -)) -def test_create_time_filter(test_time: str, expected: str, downloader_mock: MagicMock): - downloader_mock.args.time = test_time - result = RedditDownloader._create_time_filter(downloader_mock) - - assert isinstance(result, RedditTypes.TimeType) - assert result.name.lower() == expected - - -@pytest.mark.parametrize(('test_sort', 'expected'), ( - ('', 'hot'), - ('hot', 'hot'), - ('controversial', 'controversial'), - ('new', 'new'), -)) -def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: MagicMock): - downloader_mock.args.sort = test_sort - result = RedditDownloader._create_sort_filter(downloader_mock) - - assert isinstance(result, RedditTypes.SortType) - assert result.name.lower() == expected - - -@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( - ('{POSTID}', '{SUBREDDIT}'), - ('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}'), - ('{POSTID}', 'test'), - ('{POSTID}', ''), - ('{POSTID}', '{SUBREDDIT}/{REDDITOR}'), -)) -def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): - downloader_mock.args.file_scheme = test_file_scheme - downloader_mock.args.folder_scheme = test_folder_scheme - result = RedditDownloader._create_file_name_formatter(downloader_mock) - - assert isinstance(result, FileNameFormatter) - assert result.file_format_string == test_file_scheme - assert result.directory_format_string == test_folder_scheme.split('/') - - -@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( - ('', ''), - ('', '{SUBREDDIT}'), - ('test', '{SUBREDDIT}'), -)) -def test_create_file_name_formatter_bad(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): - 
downloader_mock.args.file_scheme = test_file_scheme - downloader_mock.args.folder_scheme = test_folder_scheme - with pytest.raises(BulkDownloaderException): - RedditDownloader._create_file_name_formatter(downloader_mock) - - -def test_create_authenticator(downloader_mock: MagicMock): - result = RedditDownloader._create_authenticator(downloader_mock) - assert isinstance(result, SiteAuthenticator) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_submission_ids', ( - ('lvpf4l',), - ('lvpf4l', 'lvqnsn'), - ('lvpf4l', 'lvqnsn', 'lvl9kd'), -)) -def test_get_submissions_from_link( - test_submission_ids: list[str], - reddit_instance: praw.Reddit, - downloader_mock: MagicMock): - downloader_mock.args.link = test_submission_ids - downloader_mock.reddit_instance = reddit_instance - results = RedditDownloader._get_submissions_from_link(downloader_mock) - assert all([isinstance(sub, praw.models.Submission) for res in results for sub in res]) - assert len(results[0]) == len(test_submission_ids) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_subreddits', 'limit', 'sort_type', 'time_filter', 'max_expected_len'), ( - (('Futurology',), 10, 'hot', 'all', 10), - (('Futurology', 'Mindustry, Python'), 10, 'hot', 'all', 30), - (('Futurology',), 20, 'hot', 'all', 20), - (('Futurology', 'Python'), 10, 'hot', 'all', 20), - (('Futurology',), 100, 'hot', 'all', 100), - (('Futurology',), 0, 'hot', 'all', 0), - (('Futurology',), 10, 'top', 'all', 10), - (('Futurology',), 10, 'top', 'week', 10), - (('Futurology',), 10, 'hot', 'week', 10), -)) -def test_get_subreddit_normal( - test_subreddits: list[str], - limit: int, - sort_type: str, - time_filter: str, - max_expected_len: int, - downloader_mock: MagicMock, - reddit_instance: praw.Reddit, -): - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock.args.limit = limit - downloader_mock.args.sort = sort_type - downloader_mock.args.subreddit = test_subreddits - downloader_mock.reddit_instance = reddit_instance - downloader_mock.sort_filter = RedditDownloader._create_sort_filter(downloader_mock) - results = RedditDownloader._get_subreddits(downloader_mock) - test_subreddits = downloader_mock._split_args_input(test_subreddits) - results = [sub for res1 in results for sub in res1] - assert all([isinstance(res1, praw.models.Submission) for res1 in results]) - assert all([res.subreddit.display_name in test_subreddits for res in results]) - assert len(results) <= max_expected_len - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), ( - (('Python',), 'scraper', 10, 'all', 10), - (('Python',), '', 10, 'all', 10), - (('Python',), 'djsdsgewef', 10, 'all', 0), - (('Python',), 'scraper', 10, 'year', 10), - (('Python',), 'scraper', 10, 'hour', 1), -)) -def test_get_subreddit_search( - test_subreddits: list[str], - search_term: str, - time_filter: str, - limit: int, - max_expected_len: int, - downloader_mock: MagicMock, - reddit_instance: praw.Reddit, -): - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock.args.limit = limit - downloader_mock.args.search = search_term - downloader_mock.args.subreddit = test_subreddits - downloader_mock.reddit_instance = reddit_instance - downloader_mock.sort_filter = RedditTypes.SortType.HOT - downloader_mock.args.time = time_filter - downloader_mock.time_filter = 
RedditDownloader._create_time_filter(downloader_mock) - results = RedditDownloader._get_subreddits(downloader_mock) - results = [sub for res in results for sub in res] - assert all([isinstance(res, praw.models.Submission) for res in results]) - assert all([res.subreddit.display_name in test_subreddits for res in results]) - assert len(results) <= max_expected_len - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_user', 'test_multireddits', 'limit'), ( - ('helen_darten', ('cuteanimalpics',), 10), - ('korfor', ('chess',), 100), -)) -# Good sources at https://www.reddit.com/r/multihub/ -def test_get_multireddits_public( - test_user: str, - test_multireddits: list[str], - limit: int, - reddit_instance: praw.Reddit, - downloader_mock: MagicMock, -): - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock.sort_filter = RedditTypes.SortType.HOT - downloader_mock.args.limit = limit - downloader_mock.args.multireddit = test_multireddits - downloader_mock.args.user = test_user - downloader_mock.reddit_instance = reddit_instance - downloader_mock._create_filtered_listing_generator.return_value = \ - RedditDownloader._create_filtered_listing_generator( - downloader_mock, - reddit_instance.multireddit(test_user, test_multireddits[0]), - ) - results = RedditDownloader._get_multireddits(downloader_mock) - results = [sub for res in results for sub in res] - assert all([isinstance(res, praw.models.Submission) for res in results]) - assert len(results) == limit - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_user', 'limit'), ( - ('danigirl3694', 10), - ('danigirl3694', 50), - ('CapitanHam', None), -)) -def test_get_user_submissions(test_user: str, limit: int, downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.args.limit = limit - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock.sort_filter = RedditTypes.SortType.HOT - downloader_mock.args.submitted = True - downloader_mock.args.user = test_user - downloader_mock.authenticated = False - downloader_mock.reddit_instance = reddit_instance - downloader_mock._create_filtered_listing_generator.return_value = \ - RedditDownloader._create_filtered_listing_generator( - downloader_mock, - reddit_instance.redditor(test_user).submissions, - ) - results = RedditDownloader._get_user_data(downloader_mock) - results = assert_all_results_are_submissions(limit, results) - assert all([res.author.name == test_user for res in results]) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.authenticated -@pytest.mark.parametrize('test_flag', ( - 'upvoted', - 'saved', -)) -def test_get_user_authenticated_lists( - test_flag: str, - downloader_mock: MagicMock, - authenticated_reddit_instance: praw.Reddit, -): - downloader_mock.args.__dict__[test_flag] = True - downloader_mock.reddit_instance = authenticated_reddit_instance - downloader_mock.args.user = 'me' - downloader_mock.args.limit = 10 - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock.sort_filter = RedditTypes.SortType.HOT - RedditDownloader._resolve_user_name(downloader_mock) - results = RedditDownloader._get_user_data(downloader_mock) - assert_all_results_are_submissions(10, results) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id', 'expected_files_len'), ( - ('ljyy27', 4), -)) -def test_download_submission( - test_submission_id: str, - 
expected_files_len: int, - downloader_mock: MagicMock, - reddit_instance: praw.Reddit, - tmp_path: Path): - downloader_mock.reddit_instance = reddit_instance - downloader_mock.download_filter.check_url.return_value = True - downloader_mock.args.folder_scheme = '' - downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) - downloader_mock.download_directory = tmp_path - submission = downloader_mock.reddit_instance.submission(id=test_submission_id) - RedditDownloader._download_submission(downloader_mock, submission) - folder_contents = list(tmp_path.iterdir()) - assert len(folder_contents) == expected_files_len - - -@pytest.mark.online -@pytest.mark.reddit -def test_download_submission_file_exists( - downloader_mock: MagicMock, - reddit_instance: praw.Reddit, - tmp_path: Path, - capsys: pytest.CaptureFixture -): - setup_logging(3) - downloader_mock.reddit_instance = reddit_instance - downloader_mock.download_filter.check_url.return_value = True - downloader_mock.args.folder_scheme = '' - downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) - downloader_mock.download_directory = tmp_path - submission = downloader_mock.reddit_instance.submission(id='m1hqw6') - Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch() - RedditDownloader._download_submission(downloader_mock, submission) - folder_contents = list(tmp_path.iterdir()) - output = capsys.readouterr() - assert len(folder_contents) == 1 - assert 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png already exists' in output.out - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id', 'test_hash'), ( - ('m1hqw6', 'a912af8905ae468e0121e9940f797ad7'), -)) -def test_download_submission_hash_exists( - test_submission_id: str, - test_hash: str, - downloader_mock: MagicMock, - reddit_instance: praw.Reddit, - tmp_path: Path, - capsys: pytest.CaptureFixture -): - setup_logging(3) - downloader_mock.reddit_instance = reddit_instance - downloader_mock.download_filter.check_url.return_value = True - downloader_mock.args.folder_scheme = '' - downloader_mock.args.no_dupes = True - downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) - downloader_mock.download_directory = tmp_path - downloader_mock.master_hash_list = {test_hash: None} - submission = downloader_mock.reddit_instance.submission(id=test_submission_id) - RedditDownloader._download_submission(downloader_mock, submission) - folder_contents = list(tmp_path.iterdir()) - output = capsys.readouterr() - assert len(folder_contents) == 0 - assert re.search(r'Resource hash .*? 
downloaded elsewhere', output.out) - - -@pytest.mark.parametrize(('test_name', 'expected'), ( - ('Mindustry', 'Mindustry'), - ('Futurology', 'Futurology'), - ('r/Mindustry', 'Mindustry'), - ('TrollXChromosomes', 'TrollXChromosomes'), - ('r/TrollXChromosomes', 'TrollXChromosomes'), - ('https://www.reddit.com/r/TrollXChromosomes/', 'TrollXChromosomes'), - ('https://www.reddit.com/r/TrollXChromosomes', 'TrollXChromosomes'), - ('https://www.reddit.com/r/Futurology/', 'Futurology'), - ('https://www.reddit.com/r/Futurology', 'Futurology'), -)) -def test_sanitise_subreddit_name(test_name: str, expected: str): - result = RedditDownloader._sanitise_subreddit_name(test_name) - assert result == expected - - -def test_search_existing_files(): - results = RedditDownloader.scan_existing_files(Path('.')) - assert len(results.keys()) >= 40 - - -@pytest.mark.parametrize(('test_subreddit_entries', 'expected'), ( - (['test1', 'test2', 'test3'], {'test1', 'test2', 'test3'}), - (['test1,test2', 'test3'], {'test1', 'test2', 'test3'}), - (['test1, test2', 'test3'], {'test1', 'test2', 'test3'}), - (['test1; test2', 'test3'], {'test1', 'test2', 'test3'}), - (['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'}) -)) -def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]): - results = RedditDownloader._split_args_input(test_subreddit_entries) - assert results == expected - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_submission_id', ( - 'm1hqw6', -)) -def test_mark_hard_link( - test_submission_id: str, - downloader_mock: MagicMock, - tmp_path: Path, - reddit_instance: praw.Reddit -): - downloader_mock.reddit_instance = reddit_instance - downloader_mock.args.make_hard_links = True - downloader_mock.download_directory = tmp_path - downloader_mock.args.folder_scheme = '' - downloader_mock.args.file_scheme = '{POSTID}' - downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) - submission = downloader_mock.reddit_instance.submission(id=test_submission_id) - original = Path(tmp_path, f'{test_submission_id}.png') - - RedditDownloader._download_submission(downloader_mock, submission) - assert original.exists() - - downloader_mock.args.file_scheme = 'test2_{POSTID}' - downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) - RedditDownloader._download_submission(downloader_mock, submission) - test_file_1_stats = original.stat() - test_file_2_inode = Path(tmp_path, f'test2_{test_submission_id}.png').stat().st_ino - - assert test_file_1_stats.st_nlink == 2 - assert test_file_1_stats.st_ino == test_file_2_inode - - @pytest.mark.parametrize(('test_ids', 'test_excluded', 'expected_len'), ( (('aaaaaa',), (), 1), (('aaaaaa',), ('aaaaaa',), 0), @@ -453,81 +49,113 @@ def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_ assert downloader_mock._download_submission.call_count == expected_len -def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path): - test_file = tmp_path / 'test.txt' - test_file.write_text('aaaaaa\nbbbbbb') - downloader_mock.args.skip_id_file = [test_file] - results = RedditDownloader._read_excluded_ids(downloader_mock) - assert results == {'aaaaaa', 'bbbbbb'} - - @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize('test_redditor_name', ( - 'Paracortex', - 'crowdstrike', - 'HannibalGoddamnit', +@pytest.mark.parametrize('test_submission_id', ( + 'm1hqw6', )) -def 
test_check_user_existence_good( - test_redditor_name: str, - reddit_instance: praw.Reddit, - downloader_mock: MagicMock, -): - downloader_mock.reddit_instance = reddit_instance - RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_redditor_name', ( - 'lhnhfkuhwreolo', - 'adlkfmnhglojh', -)) -def test_check_user_existence_nonexistent( - test_redditor_name: str, - reddit_instance: praw.Reddit, +def test_mark_hard_link( + test_submission_id: str, downloader_mock: MagicMock, + tmp_path: Path, + reddit_instance: praw.Reddit ): downloader_mock.reddit_instance = reddit_instance - with pytest.raises(BulkDownloaderException, match='Could not find'): - RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) + downloader_mock.args.make_hard_links = True + downloader_mock.download_directory = tmp_path + downloader_mock.args.folder_scheme = '' + downloader_mock.args.file_scheme = '{POSTID}' + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + original = Path(tmp_path, f'{test_submission_id}.png') + + RedditDownloader._download_submission(downloader_mock, submission) + assert original.exists() + + downloader_mock.args.file_scheme = 'test2_{POSTID}' + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + RedditDownloader._download_submission(downloader_mock, submission) + test_file_1_stats = original.stat() + test_file_2_inode = Path(tmp_path, f'test2_{test_submission_id}.png').stat().st_ino + + assert test_file_1_stats.st_nlink == 2 + assert test_file_1_stats.st_ino == test_file_2_inode + + +def test_search_existing_files(): + results = RedditDownloader.scan_existing_files(Path('.')) + assert len(results.keys()) >= 40 @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize('test_redditor_name', ( - 'Bree-Boo', +@pytest.mark.parametrize(('test_submission_id', 'test_hash'), ( + ('m1hqw6', 'a912af8905ae468e0121e9940f797ad7'), )) -def test_check_user_existence_banned( - test_redditor_name: str, - reddit_instance: praw.Reddit, +def test_download_submission_hash_exists( + test_submission_id: str, + test_hash: str, downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture ): + setup_logging(3) downloader_mock.reddit_instance = reddit_instance - with pytest.raises(BulkDownloaderException, match='is banned'): - RedditDownloader._check_user_existence(downloader_mock, test_redditor_name) + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.no_dupes = True + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + downloader_mock.master_hash_list = {test_hash: None} + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + folder_contents = list(tmp_path.iterdir()) + output = capsys.readouterr() + assert len(folder_contents) == 0 + assert re.search(r'Resource hash .*? 
downloaded elsewhere', output.out) @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('test_subreddit_name', 'expected_message'), ( - ('donaldtrump', 'cannot be found'), - ('submitters', 'private and cannot be scraped') -)) -def test_check_subreddit_status_bad(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): - test_subreddit = reddit_instance.subreddit(test_subreddit_name) - with pytest.raises(BulkDownloaderException, match=expected_message): - RedditDownloader._check_subreddit_status(test_subreddit) +def test_download_submission_file_exists( + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id='m1hqw6') + Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch() + RedditDownloader._download_submission(downloader_mock, submission) + folder_contents = list(tmp_path.iterdir()) + output = capsys.readouterr() + assert len(folder_contents) == 1 + assert 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png already exists' in output.out @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize('test_subreddit_name', ( - 'Python', - 'Mindustry', - 'TrollXChromosomes', - 'all', +@pytest.mark.parametrize(('test_submission_id', 'expected_files_len'), ( + ('ljyy27', 4), )) -def test_check_subreddit_status_good(test_subreddit_name: str, reddit_instance: praw.Reddit): - test_subreddit = reddit_instance.subreddit(test_subreddit_name) - RedditDownloader._check_subreddit_status(test_subreddit) +def test_download_submission( + test_submission_id: str, + expected_files_len: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path): + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + folder_contents = list(tmp_path.iterdir()) + assert len(folder_contents) == expected_files_len From fa04d61eb8e23b94f75cb65c1cf8b4dfdc868b09 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 17 May 2021 11:04:24 +1000 Subject: [PATCH 013/150] Refactor archiver tests --- tests/test_archiver.py | 47 +++++++----------------------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/tests/test_archiver.py b/tests/test_archiver.py index 622c555..627caee 100644 --- a/tests/test_archiver.py +++ b/tests/test_archiver.py @@ -7,51 +7,20 @@ from unittest.mock import MagicMock import praw import pytest -from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry from bdfr.archiver import Archiver @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize('test_submission_id', ( - 'm3reby', +@pytest.mark.parametrize(('test_submission_id', 'test_format'), ( + ('m3reby', 'xml'), + ('m3reby', 'json'), + ('m3reby', 'yaml'), )) -def 
test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): +def test_write_submission_json(test_submission_id: str, tmp_path: Path, test_format: str, reddit_instance: praw.Reddit): archiver_mock = MagicMock() - test_path = Path(tmp_path, 'test.json') + archiver_mock.args.format = test_format + test_path = Path(tmp_path, 'test') test_submission = reddit_instance.submission(id=test_submission_id) archiver_mock.file_name_formatter.format_path.return_value = test_path - test_entry = SubmissionArchiveEntry(test_submission) - Archiver._write_entry_json(archiver_mock, test_entry) - archiver_mock._write_content_to_disk.assert_called_once() - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_submission_id', ( - 'm3reby', -)) -def test_write_submission_xml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): - archiver_mock = MagicMock() - test_path = Path(tmp_path, 'test.xml') - test_submission = reddit_instance.submission(id=test_submission_id) - archiver_mock.file_name_formatter.format_path.return_value = test_path - test_entry = SubmissionArchiveEntry(test_submission) - Archiver._write_entry_xml(archiver_mock, test_entry) - archiver_mock._write_content_to_disk.assert_called_once() - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_submission_id', ( - 'm3reby', -)) -def test_write_submission_yaml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): - archiver_mock = MagicMock() - archiver_mock.download_directory = tmp_path - test_path = Path(tmp_path, 'test.yaml') - test_submission = reddit_instance.submission(id=test_submission_id) - archiver_mock.file_name_formatter.format_path.return_value = test_path - test_entry = SubmissionArchiveEntry(test_submission) - Archiver._write_entry_yaml(archiver_mock, test_entry) - archiver_mock._write_content_to_disk.assert_called_once() + Archiver.write_entry(archiver_mock, test_submission) From c581bef790f0be1258c6c2c9e637b8811dd4fc82 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Mon, 17 May 2021 20:49:35 +1000 Subject: [PATCH 014/150] Set file creation times to the post creation time (#391) --- bdfr/downloader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 6fa37d6..cfb653b 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -10,6 +10,7 @@ import os import re import shutil import socket +import time from datetime import datetime from enum import Enum, auto from multiprocessing import Pool @@ -440,6 +441,8 @@ class RedditDownloader: with open(destination, 'wb') as file: file.write(res.content) logger.debug(f'Written file to {destination}') + creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple()) + os.utime(destination, (creation_time, creation_time)) self.master_hash_list[resource_hash] = destination logger.debug(f'Hash added to master list: {resource_hash}') logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') From a75cc0dee9197fdc3ddb6293dbb107bb413c82d6 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Mon, 17 May 2021 14:18:08 +0300 Subject: [PATCH 015/150] Add file modified date test --- tests/test_downloader.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ee43625..ad4e1f8 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -2,6 +2,7 @@ # 
coding=utf-8 import re +import os from pathlib import Path from unittest.mock import MagicMock @@ -82,6 +83,32 @@ def test_mark_hard_link( assert test_file_1_stats.st_ino == test_file_2_inode +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'test_creation_date'), ( + ('ndzz50', 1621204841.0), +)) +def test_file_creation_date( + test_submission_id: str, + test_creation_date: float, + downloader_mock: MagicMock, + tmp_path: Path, + reddit_instance: praw.Reddit +): + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_directory = tmp_path + downloader_mock.args.folder_scheme = '' + downloader_mock.args.file_scheme = '{POSTID}' + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + + RedditDownloader._download_submission(downloader_mock, submission) + + for file_path in Path(tmp_path).iterdir(): + file_stats = os.stat(file_path) + assert file_stats.st_mtime == test_creation_date + + def test_search_existing_files(): results = RedditDownloader.scan_existing_files(Path('.')) assert len(results.keys()) >= 40 From 122aa2839bbfc938799e1eaf97b2b5ed6a43f29b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 18 May 2021 12:02:51 +1000 Subject: [PATCH 016/150] Remove bad test case --- tests/site_downloaders/test_erome.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 1758eed..84546c4 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -41,12 +41,6 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): 'a1abf398cfd4ef9cfaf093ceb10c746a', 'bd9e1a4ea5ef0d6ba47fb90e337c2d14' }), - ('https://www.erome.com/a/IK5HADyi', { - '3b2a441ff821c09d9b629271a8b0f19f', - '470343fa67fd2ef9687c4223d278f761', - '7fbbc092939919aa74a710ddd26adc02', - 'c7299a73e019ab635b47c863fe3cd473', - }) )) def test_download_resource(test_url: str, expected_hashes: tuple[str]): # Can't compare hashes for this test, Erome doesn't return the exact same file from request to request so the hash From 32f72c35ec68350bb9e3c91d5d0a2db21d298fc8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 18 May 2021 13:58:07 +1000 Subject: [PATCH 017/150] Update script to catch more failed IDs --- scripts/extract_failed_ids.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index cdf1f21..033ecac 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -14,5 +14,8 @@ else output="failed.txt" fi -grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev >>"$output" -grep 'Failed to download resource' "$file" | awk '{ print $15 }' >>"$output" +{ + grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; \ + grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; \ + grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; \ +} >>"$output" From 3b28ad24b353197a0d574f161bf12c497a19a0dd Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 19 May 2021 09:55:03 +1000 Subject: [PATCH 018/150] Fix bug with some Imgur extensions --- bdfr/site_downloaders/imgur.py | 1 + tests/site_downloaders/test_imgur.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git 
a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 6ae8a5e..3d071d4 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -71,6 +71,7 @@ class Imgur(BaseDownloader): @staticmethod def _validate_extension(extension_suffix: str) -> str: + extension_suffix = extension_suffix.strip('?1') possible_extensions = ('.jpg', '.png', '.mp4', '.gif') selection = [ext for ext in possible_extensions if ext == extension_suffix] if len(selection) == 1: diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index ee98c42..792926a 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -122,6 +122,14 @@ def test_imgur_extension_validation_bad(test_extension: str): '029c475ce01b58fdf1269d8771d33913', ), ), + ( + 'https://imgur.com/a/eemHCCK', + ( + '9cb757fd8f055e7ef7aa88addc9d9fa5', + 'b6cb6c918e2544e96fb7c07d828774b5', + 'fb6c913d721c0bbb96aa65d7f560d385', + ), + ), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() @@ -131,5 +139,4 @@ def test_find_resources(test_url: str, expected_hashes: list[str]): assert all([isinstance(res, Resource) for res in results]) [res.download(120) for res in results] hashes = set([res.hash.hexdigest() for res in results]) - assert len(results) == len(expected_hashes) assert hashes == set(expected_hashes) From 830e4f283035bed3f7a78e9354abe7ec488e7b11 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 19 May 2021 10:07:55 +1000 Subject: [PATCH 019/150] Catch additional error --- bdfr/resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index 966f5ba..e660d33 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -5,8 +5,8 @@ import hashlib import logging import re import time -from typing import Optional import urllib.parse +from typing import Optional import _hashlib import requests @@ -39,7 +39,7 @@ class Resource: else: raise BulkDownloaderException( f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') - except requests.exceptions.ConnectionError as e: + except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: logger.warning(f'Error occured downloading from {url}, waiting {wait_time} seconds: {e}') time.sleep(wait_time) if wait_time < max_wait_time: From 8fb5103d09532edd143a561630799f546ddf3a4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Wed, 19 May 2021 13:58:53 +0300 Subject: [PATCH 020/150] Adds download count badge --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cf5269c..15d50c2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Bulk Downloader for Reddit -[![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=v2)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml) -[![PyPI version](https://badge.fury.io/py/bdfr.svg)](https://badge.fury.io/py/bdfr) +[![PyPI version](https://img.shields.io/pypi/v/bdfr.svg)](https://pypi.python.org/pypi/bdfr) +[![PyPI downloads](https://img.shields.io/pypi/dm/bdfr)](https://pypi.python.org/pypi/bdfr) +[![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml) This is a tool to download submissions or 
submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources) From 32b29f541373d5d81524be7557f22993e3fa4825 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Wed, 19 May 2021 21:59:38 +0300 Subject: [PATCH 021/150] Ignore IDE files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 7207598..3918aa5 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,6 @@ cython_debug/ # Test configuration file test_config.cfg + +.vscode/ +.idea/ \ No newline at end of file From 827f1ab80eab43ec9989d41d90880f8ddec91f96 Mon Sep 17 00:00:00 2001 From: Ailothaen Date: Tue, 18 May 2021 22:30:12 +0200 Subject: [PATCH 022/150] Adding some more info in threads and comments: distinguished, spoiler, locked, sticky --- bdfr/archive_entry/base_archive_entry.py | 1 + bdfr/archive_entry/submission_archive_entry.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/bdfr/archive_entry/base_archive_entry.py b/bdfr/archive_entry/base_archive_entry.py index 775ed68..7b84fbe 100644 --- a/bdfr/archive_entry/base_archive_entry.py +++ b/bdfr/archive_entry/base_archive_entry.py @@ -26,6 +26,7 @@ class BaseArchiveEntry(ABC): 'stickied': in_comment.stickied, 'body': in_comment.body, 'is_submitter': in_comment.is_submitter, + 'distinguished': in_comment.distinguished, 'created_utc': in_comment.created_utc, 'parent_id': in_comment.parent_id, 'replies': [], diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index aaa423b..538aea8 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -35,6 +35,10 @@ class SubmissionArchiveEntry(BaseArchiveEntry): 'link_flair_text': self.source.link_flair_text, 'num_comments': self.source.num_comments, 'over_18': self.source.over_18, + 'spoiler': self.source.spoiler, + 'pinned': self.source.pinned, + 'locked': self.source.locked, + 'distinguished': self.source.distinguished, 'created_utc': self.source.created_utc, } From 9d6e54148be973cafdd313709394105bdd4be663 Mon Sep 17 00:00:00 2001 From: Ailothaen Date: Thu, 20 May 2021 22:41:02 +0200 Subject: [PATCH 023/150] Added tests for new data in threads/comments --- tests/archive_entry/test_comment_archive_entry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/archive_entry/test_comment_archive_entry.py b/tests/archive_entry/test_comment_archive_entry.py index 27dfcb3..e453d27 100644 --- a/tests/archive_entry/test_comment_archive_entry.py +++ b/tests/archive_entry/test_comment_archive_entry.py @@ -15,6 +15,7 @@ from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry 'subreddit': 'Python', 'submission': 'mgi4op', 'submission_title': '76% Faster CPython', + 'distinguished': None, }), )) def test_get_comment_details(test_comment_id: str, expected_dict: dict, reddit_instance: praw.Reddit): From f4c1adaa9a836a7f3ab4fa35a003fe417dedf72c Mon Sep 17 00:00:00 2001 From: Ailothaen Date: Thu, 20 May 2021 22:47:33 +0200 Subject: [PATCH 024/150] Added tests for new data in threads/comments (2/2) --- tests/archive_entry/test_submission_archive_entry.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/archive_entry/test_submission_archive_entry.py b/tests/archive_entry/test_submission_archive_entry.py index 2b1bb72..60f47b5 100644 --- 
a/tests/archive_entry/test_submission_archive_entry.py +++ b/tests/archive_entry/test_submission_archive_entry.py @@ -26,6 +26,13 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc 'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image', + 'pinned': False, + 'spoiler': False, + 'over_18': False, + 'locked': False, + 'distinguished': None, + 'created_utc': 1615583837, + 'permalink': '/r/australia/comments/m3reby/this_little_guy_fell_out_of_a_tree_and_in_front/' }), ('m3kua3', {'author': 'DELETED'}), )) From cf6905db282482eafae147b7eca605b1e4fd51f4 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Fri, 21 May 2021 00:14:35 +0300 Subject: [PATCH 025/150] Reverts #384 --- README.md | 6 +++--- bdfr/__main__.py | 6 +++--- bdfr/configuration.py | 6 +++--- bdfr/connector.py | 6 +++--- scripts/README.md | 2 +- tests/test_connector.py | 4 ++-- tests/test_integration.py | 6 +++--- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index f89ef23..9f32599 100644 --- a/README.md +++ b/README.md @@ -151,17 +151,17 @@ The following options apply only to the `download` command. This command downloa - Sets the scheme for folders - Default is `{SUBREDDIT}` - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details -- `--skip-id` +- `--exclude-id` - This will skip the download of any submission with the ID provided - Can be specified multiple times -- `--skip-id-file` +- `--exclude-id-file` - This will skip the download of any submission with any of the IDs in the files provided - Can be specified multiple times - Format is one ID per line - `--skip-domain` - This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded - Can be specified multiple times -- `--skip-format` +- `--skip` - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded - Can be specified multiple times - `--skip-subreddit` diff --git a/bdfr/__main__.py b/bdfr/__main__.py index bafa93c..28ef207 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -51,9 +51,9 @@ def cli(): @click.option('--max-wait-time', type=int, default=None) @click.option('--no-dupes', is_flag=True, default=None) @click.option('--search-existing', is_flag=True, default=None) -@click.option('--skip-id', default=None, multiple=True) -@click.option('--skip-id-file', default=None, multiple=True) -@click.option('--skip-format', default=None, multiple=True) +@click.option('--exclude-id', default=None, multiple=True) +@click.option('--exclude-id-file', default=None, multiple=True) +@click.option('--skip', default=None, multiple=True) @click.option('--skip-domain', default=None, multiple=True) @click.option('--skip-subreddit', default=None, multiple=True) @_add_common_options diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 8cb8f10..9ab9d45 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -13,8 +13,8 @@ class Configuration(Namespace): self.authenticate = False self.config = None self.directory: str = '.' 
-        self.skip_id = []
-        self.skip_id_file = []
+        self.exclude_id = []
+        self.exclude_id_file = []
         self.limit: Optional[int] = None
         self.link: list[str] = []
         self.log: Optional[str] = None
@@ -26,7 +26,7 @@ class Configuration(Namespace):
         self.search_existing: bool = False
         self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
         self.folder_scheme: str = '{SUBREDDIT}'
-        self.skip_format: list[str] = []
+        self.skip: list[str] = []
         self.skip_domain: list[str] = []
         self.skip_subreddit: list[str] = []
         self.sort: str = 'hot'
diff --git a/bdfr/connector.py b/bdfr/connector.py
index 3dcc118..c20b749 100644
--- a/bdfr/connector.py
+++ b/bdfr/connector.py
@@ -367,7 +367,7 @@ class RedditConnector(metaclass=ABCMeta):
         return RedditTypes.SortType.HOT
 
     def create_download_filter(self) -> DownloadFilter:
-        return DownloadFilter(self.args.skip_format, self.args.skip_domain)
+        return DownloadFilter(self.args.skip, self.args.skip_domain)
 
     def create_authenticator(self) -> SiteAuthenticator:
         return SiteAuthenticator(self.cfg_parser)
@@ -389,8 +389,8 @@ class RedditConnector(metaclass=ABCMeta):
 
     def read_excluded_ids(self) -> set[str]:
         out = []
-        out.extend(self.args.skip_id)
-        for id_file in self.args.skip_id_file:
+        out.extend(self.args.exclude_id)
+        for id_file in self.args.exclude_id_file:
             id_file = Path(id_file).resolve().expanduser()
             if not id_file.exists():
                 logger.warning(f'ID exclusion file at {id_file} does not exist')
diff --git a/scripts/README.md b/scripts/README.md
index 51e51bb..4bb098b 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -9,7 +9,7 @@ Due to the verboseness of the logs, a great deal of information can be gathered
 
 ## Extract all Successfully Downloaded IDs
 
-This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--skip-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources.
+This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--exclude-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources.
 The script can be used with the following signature:

diff --git a/tests/test_connector.py b/tests/test_connector.py
index 41d9115..1078707 100644
--- a/tests/test_connector.py
+++ b/tests/test_connector.py
@@ -54,7 +54,7 @@ def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock):
     (['.test'], ['test.com'],),
 ))
 def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock):
-    downloader_mock.args.skip_format = skip_extensions
+    downloader_mock.args.skip = skip_extensions
     downloader_mock.args.skip_domain = skip_domains
     result = RedditConnector.create_download_filter(downloader_mock)
 
@@ -324,7 +324,7 @@ def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: se
 def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
     test_file = tmp_path / 'test.txt'
     test_file.write_text('aaaaaa\nbbbbbb')
-    downloader_mock.args.skip_id_file = [test_file]
+    downloader_mock.args.exclude_id_file = [test_file]
     results = RedditConnector.read_excluded_ids(downloader_mock)
     assert results == {'aaaaaa', 'bbbbbb'}
 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 419464f..7aec0eb 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -166,8 +166,8 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'tumblr', '-L', '25', '--skip-format', 'png', '--skip-format', 'jpg'],
-    ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip-format', 'txt'],
+    ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'],
+    ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip', 'txt'],
 ))
 def test_cli_download_download_filters(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
@@ -299,7 +299,7 @@ def test_cli_download_use_default_config(tmp_path: Path):
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['-l', 'm2601g', '--skip-id', 'm2601g'],
+    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
 ))
 def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path):
     runner = CliRunner()

From da8c64ec51cf406250e4a4946a4930232bcb737c Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Fri, 21 May 2021 21:41:57 +0300
Subject: [PATCH 026/150] Read files in chunks when hashing (#416)

---
 bdfr/downloader.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 3c157f1..7973733 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -22,9 +22,15 @@ logger = logging.getLogger(__name__)
 
 
 def _calc_hash(existing_file: Path):
+    CHUNK_SIZE = 1024 * 1024
+    md5_hash = hashlib.md5()
     with open(existing_file, 'rb') as file:
-        file_hash = hashlib.md5(file.read()).hexdigest()
-        return existing_file, file_hash
+        chunk = file.read(CHUNK_SIZE)
+        while chunk:
+            md5_hash.update(chunk)
+            chunk = file.read(CHUNK_SIZE)
+    file_hash = md5_hash.hexdigest()
+    return existing_file, file_hash

From a104a154fc77171ab69d93f0c6ecb5c284f9c81a Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 21 May 2021 16:50:05 +1000
Subject: [PATCH 027/150] Simplify method structure

---
 bdfr/downloader.py | 53 
+++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 7973733..2e47ae9 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -71,35 +71,36 @@ class RedditDownloader(RedditConnector): for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): if destination.exists(): logger.debug(f'File {destination} already exists, continuing') + continue elif not self.download_filter.check_resource(res): logger.debug(f'Download filter removed {submission.id} with URL {submission.url}') - else: - try: - res.download(self.args.max_wait_time) - except errors.BulkDownloaderException as e: - logger.error(f'Failed to download resource {res.url} in submission {submission.id} ' - f'with downloader {downloader_class.__name__}: {e}') + continue + try: + res.download(self.args.max_wait_time) + except errors.BulkDownloaderException as e: + logger.error(f'Failed to download resource {res.url} in submission {submission.id} ' + f'with downloader {downloader_class.__name__}: {e}') + return + resource_hash = res.hash.hexdigest() + destination.parent.mkdir(parents=True, exist_ok=True) + if resource_hash in self.master_hash_list: + if self.args.no_dupes: + logger.info( + f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere') return - resource_hash = res.hash.hexdigest() - destination.parent.mkdir(parents=True, exist_ok=True) - if resource_hash in self.master_hash_list: - if self.args.no_dupes: - logger.info( - f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere') - return - elif self.args.make_hard_links: - self.master_hash_list[resource_hash].link_to(destination) - logger.info( - f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}') - return - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug(f'Written file to {destination}') - creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple()) - os.utime(destination, (creation_time, creation_time)) - self.master_hash_list[resource_hash] = destination - logger.debug(f'Hash added to master list: {resource_hash}') - logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') + elif self.args.make_hard_links: + self.master_hash_list[resource_hash].link_to(destination) + logger.info( + f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}') + return + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug(f'Written file to {destination}') + creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple()) + os.utime(destination, (creation_time, creation_time)) + self.master_hash_list[resource_hash] = destination + logger.debug(f'Hash added to master list: {resource_hash}') + logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') @staticmethod def scan_existing_files(directory: Path) -> dict[str, Path]: From da72d8ac2d849620332594a5d65c095e4504c076 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 21 May 2021 17:02:43 +1000 Subject: [PATCH 028/150] Remove unneeded backslashes --- scripts/extract_failed_ids.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 033ecac..7108592 100755 --- a/scripts/extract_failed_ids.sh +++ 
b/scripts/extract_failed_ids.sh @@ -15,7 +15,7 @@ else fi { - grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; \ - grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; \ - grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; \ + grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; + grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; + grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; } >>"$output" From 4395dd46469d47ac188ffcbf9d556031db431a00 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 22 May 2021 11:47:48 +1000 Subject: [PATCH 029/150] Update logging messages to include submission IDs --- bdfr/downloader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 2e47ae9..cc4e8bb 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -70,7 +70,7 @@ class RedditDownloader(RedditConnector): return for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): if destination.exists(): - logger.debug(f'File {destination} already exists, continuing') + logger.debug(f'File {destination} from submission {submission.id} already exists, continuing') continue elif not self.download_filter.check_resource(res): logger.debug(f'Download filter removed {submission.id} with URL {submission.url}') @@ -91,7 +91,8 @@ class RedditDownloader(RedditConnector): elif self.args.make_hard_links: self.master_hash_list[resource_hash].link_to(destination) logger.info( - f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}') + f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}' + f' in submission {submission.id}') return with open(destination, 'wb') as file: file.write(res.content) From 527a8af7b7fbd33b280bfd8072c70e15808ba063 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 22 May 2021 11:47:59 +1000 Subject: [PATCH 030/150] Update script to extract IDs --- scripts/extract_successful_ids.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 3b6f7bc..6c7930d 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -14,4 +14,9 @@ else output="successful.txt" fi -grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' >> "$output" +{ + grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' ; + grep 'Resource hash' "$file" | awk '{ print $(NF-2) }' ; + grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ; + grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; +} >> "$output" From 5aae6b3df88b21216fc1a756b98f5a21fbe9607a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 22 May 2021 11:53:12 +1000 Subject: [PATCH 031/150] Add another message type --- scripts/extract_successful_ids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 6c7930d..19e8bd7 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -19,4 +19,5 @@ fi grep 'Resource hash' "$file" | awk '{ print $(NF-2) }' ; grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ; grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; + grep 'Hard link made' "$file" | awk '{ print $(NF) }' ; } >> "$output" 
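The chunked `_calc_hash` introduced in patch 026 (and tidied in the next patch) is the standard pattern for hashing large files without buffering them whole in memory. A minimal standalone sketch of the same idea, assuming only the standard library; the file name and sizes below are illustrative, not taken from the BDFR:

```python
#!/usr/bin/env python3
# coding=utf-8

# Sketch of the chunked-hashing pattern from _calc_hash: reading a fixed-size
# chunk at a time keeps memory use constant regardless of file size, unlike
# hashlib.md5(file.read()), which loads the entire file at once.

import hashlib
from pathlib import Path


def calc_hash_chunked(existing_file: Path, chunk_size: int = 1024 * 1024) -> tuple[Path, str]:
    md5_hash = hashlib.md5()
    with open(existing_file, 'rb') as file:
        # iter() with a b'' sentinel yields chunks until read() returns empty
        for chunk in iter(lambda: file.read(chunk_size), b''):
            md5_hash.update(chunk)
    return existing_file, md5_hash.hexdigest()


if __name__ == '__main__':
    test_file = Path('example.bin')  # illustrative file name
    test_file.write_bytes(b'\x00' * (3 * 1024 * 1024))
    print(calc_hash_chunked(test_file))
```

Returning the `(path, hash)` pair rather than the bare digest, as the BDFR helper does, keeps the function easy to map over a whole directory scan.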
From 47a49512798d0a32642eec37dc9c880398da9e6e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 23 May 2021 12:13:44 +1000 Subject: [PATCH 032/150] Rename variable --- bdfr/downloader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index cc4e8bb..4a13823 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -22,13 +22,13 @@ logger = logging.getLogger(__name__) def _calc_hash(existing_file: Path): - CHUNK_SIZE = 1024 * 1024 + chunk_size = 1024 * 1024 md5_hash = hashlib.md5() with open(existing_file, 'rb') as file: - chunk = file.read(CHUNK_SIZE) + chunk = file.read(chunk_size) while chunk: md5_hash.update(chunk) - chunk = file.read(CHUNK_SIZE) + chunk = file.read(chunk_size) file_hash = md5_hash.hexdigest() return existing_file, file_hash From e2582ecb3eafd8df20bbb9b616669b71131ecc97 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 23 May 2021 12:17:14 +1000 Subject: [PATCH 033/150] Catch error with MacOS writing per issue #407 --- bdfr/downloader.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 4a13823..a262dae 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -94,9 +94,13 @@ class RedditDownloader(RedditConnector): f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}' f' in submission {submission.id}') return - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug(f'Written file to {destination}') + try: + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug(f'Written file to {destination}') + except OSError as e: + logger.exception(e) + logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}') creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple()) os.utime(destination, (creation_time, creation_time)) self.master_hash_list[resource_hash] = destination From 1b23e38ce4793c85d76bf8f551b772010dc03fd5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 23 May 2021 12:25:04 +1000 Subject: [PATCH 034/150] Update script to include new message --- scripts/extract_failed_ids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 7108592..89f1896 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -18,4 +18,5 @@ fi grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; + grep 'Failed to write file' "$file" | awk '{ print $16 }' | rev | cut -c 2- | rev ; } >>"$output" From 4c42469c0cb2d923b4a981584c55b3c70f05e6d5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 23 May 2021 12:40:27 +1000 Subject: [PATCH 035/150] Update failing tests --- tests/test_downloader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ad4e1f8..b4f175d 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # coding=utf-8 -import re import os +import re from pathlib import Path from unittest.mock import MagicMock @@ -111,7 +111,7 @@ def test_file_creation_date( def test_search_existing_files(): results = RedditDownloader.scan_existing_files(Path('.')) - assert len(results.keys()) >= 40 + assert 
len(results.keys()) != 0


@pytest.mark.online
@@ -163,7 +163,8 @@ def test_download_submission_file_exists(
 folder_contents = list(tmp_path.iterdir())
 output = capsys.readouterr()
 assert len(folder_contents) == 1
- assert 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png already exists' in output.out
+ assert 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png'\
+ ' from submission m1hqw6 already exists' in output.out


@pytest.mark.online

From 323b2d2b030325c65ef15465ad155c9c5307f573 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 25 May 2021 09:56:22 +1000
Subject: [PATCH 036/150] Fix download retries logic

---
 bdfr/resource.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bdfr/resource.py b/bdfr/resource.py
index e660d33..e8f9fd1 100644
--- a/bdfr/resource.py
+++ b/bdfr/resource.py
@@ -28,8 +28,7 @@ class Resource:
 self.extension = self._determine_extension()

 @staticmethod
- def retry_download(url: str, max_wait_time: int) -> Optional[bytes]:
- wait_time = 60
+ def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
 try:
 response = requests.get(url)
 if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
 return response.content
 elif response.status_code in (408, 429):
 raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
 else:
 raise BulkDownloaderException(
 f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
 except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
- logger.warning(f'Error occurred downloading from {url}, waiting {wait_time} seconds: {e}')
- time.sleep(wait_time)
- if wait_time < max_wait_time:
- return Resource.retry_download(url, max_wait_time)
+ logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
+ time.sleep(current_wait_time)
+ if current_wait_time < max_wait_time:
+ current_wait_time += 60
+ return Resource.retry_download(url, max_wait_time, current_wait_time)
 else:
 logger.error(f'Max wait time exceeded for resource at url {url}')
 raise

From b74e93d2b792b0a97310e2a778f90fa51934c35c Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 25 May 2021 18:51:00 +1000
Subject: [PATCH 037/150] Fix typo in test name

---
 tests/site_downloaders/test_download_factory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py
index f02e9f7..f89df8a 100644
--- a/tests/site_downloaders/test_download_factory.py
+++ b/tests/site_downloaders/test_download_factory.py
@@ -69,6 +69,6 @@ def test_factory_lever_bad(test_url: str):
 ('https://youtube.com/watch?v=Gv8Wz74FjVA', 'youtube.com/watch'),
 ('https://i.imgur.com/BuzvZwb.gifv', 'i.imgur.com/BuzvZwb.gifv'),
 ))
-def test_sanitise_urll(test_url: str, expected: str):
+def test_sanitise_url(test_url: str, expected: str):
 result = DownloadFactory._sanitise_url(test_url)
 assert result == expected

From f47688812d1755b58105cd5400205e3f75410441 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 25 May 2021 18:51:24 +1000
Subject: [PATCH 038/150] Rename function

---
 bdfr/site_downloaders/download_factory.py | 4 ++--
 tests/site_downloaders/test_download_factory.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py
index 7035dc2..8eff2b8 100644
--- a/bdfr/site_downloaders/download_factory.py
+++ b/bdfr/site_downloaders/download_factory.py
@@ -21,7 +21,7 @@ from bdfr.site_downloaders.youtube import Youtube

class 
DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: - sanitised_url = DownloadFactory._sanitise_url(url) + sanitised_url = DownloadFactory.sanitise_url(url) if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url): return Imgur elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url): @@ -49,7 +49,7 @@ class DownloadFactory: f'No downloader module exists for url {url}') @staticmethod - def _sanitise_url(url: str) -> str: + def sanitise_url(url: str) -> str: beginning_regex = re.compile(r'\s*(www\.?)?') split_url = urllib.parse.urlsplit(url) split_url = split_url.netloc + split_url.path diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index f89df8a..d5e84d8 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -70,5 +70,5 @@ def test_factory_lever_bad(test_url: str): ('https://i.imgur.com/BuzvZwb.gifv', 'i.imgur.com/BuzvZwb.gifv'), )) def test_sanitise_url(test_url: str, expected: str): - result = DownloadFactory._sanitise_url(test_url) + result = DownloadFactory.sanitise_url(test_url) assert result == expected From 87959028e5a0ffffe545b821660fa53e6f50dfe0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 25 May 2021 18:59:32 +1000 Subject: [PATCH 039/150] Add blacklist for web filetypes --- bdfr/site_downloaders/download_factory.py | 20 ++++++++++++++++++- .../site_downloaders/test_download_factory.py | 13 ++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 8eff2b8..cbfee2d 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -24,7 +24,8 @@ class DownloadFactory: sanitised_url = DownloadFactory.sanitise_url(url) if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url): return Imgur - elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url): + elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \ + not DownloadFactory.is_web_resource(sanitised_url): return Direct elif re.match(r'erome\.com.*', sanitised_url): return Erome @@ -55,3 +56,20 @@ class DownloadFactory: split_url = split_url.netloc + split_url.path split_url = re.sub(beginning_regex, '', split_url) return split_url + + @staticmethod + def is_web_resource(url: str) -> bool: + web_extensions = ( + 'asp', + 'cfm', + 'cfml', + 'css', + 'html', + 'js', + 'php', + 'xhtml', + ) + if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url): + return True + else: + return False diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index d5e84d8..4b5356c 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -72,3 +72,16 @@ def test_factory_lever_bad(test_url: str): def test_sanitise_url(test_url: str, expected: str): result = DownloadFactory.sanitise_url(test_url) assert result == expected + + +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('www.example.com/test.asp', True), + ('www.example.com/test.html', True), + ('www.example.com/test.js', True), + ('www.example.com/test.xhtml', True), + ('www.example.com/test.mp4', False), + ('www.example.com/test.png', False), +)) +def test_is_web_resource(test_url: str, expected: bool): + result = DownloadFactory.is_web_resource(test_url) + assert result == expected From fef2fc864bb75f601253a590d8728cce89cd89db Mon Sep 17 00:00:00 2001 
From: Serene-Arc Date: Tue, 25 May 2021 19:33:32 +1000 Subject: [PATCH 040/150] Update blacklist --- bdfr/site_downloaders/download_factory.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index cbfee2d..41813f9 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -61,12 +61,15 @@ class DownloadFactory: def is_web_resource(url: str) -> bool: web_extensions = ( 'asp', + 'aspx', 'cfm', 'cfml', 'css', + 'htm', 'html', 'js', 'php', + 'php3', 'xhtml', ) if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url): From 80bb4a8b5eaa5a7603116bb9410748655d029b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Tue, 25 May 2021 13:58:30 +0300 Subject: [PATCH 041/150] Update bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index ab4e1ab..efc9757 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -9,7 +9,7 @@ assignees: '' - [ ] I am reporting a bug. - [ ] I am running the latest version of BDfR -- [ ] I have read the [Opening an issue](../../README.md#configuration) +- [ ] I have read the [Opening an issue](https://github.com/aliparlakci/bulk-downloader-for-reddit/blob/master/docs/CONTRIBUTING.md#opening-an-issue) ## Description A clear and concise description of what the bug is. From 6b78a23484d78bb780a85370007bc98b09ff173a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 27 May 2021 15:22:58 +1000 Subject: [PATCH 042/150] Allow --user to be specified multiple times --- bdfr/__main__.py | 2 +- bdfr/archiver.py | 6 +++--- bdfr/configuration.py | 2 +- bdfr/connector.py | 41 ++++++++++++++++++++++----------------- tests/test_connector.py | 9 ++++----- tests/test_integration.py | 4 ++++ 6 files changed, 36 insertions(+), 28 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 28ef207..cf039a5 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -26,7 +26,7 @@ _common_options = [ click.option('--saved', is_flag=True, default=None), click.option('--search', default=None, type=str), click.option('--time-format', type=str, default=None), - click.option('-u', '--user', type=str, default=None), + click.option('-u', '--user', type=str, multiple=True, default=None), click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None), click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')), default=None), diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 3e0b907..b19a042 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -14,7 +14,6 @@ from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry from bdfr.configuration import Configuration -from bdfr.downloader import RedditDownloader from bdfr.connector import RedditConnector from bdfr.exceptions import ArchiverError from bdfr.resource import Resource @@ -47,8 +46,9 @@ class Archiver(RedditConnector): results = super(Archiver, self).get_user_data() if self.args.user and self.args.all_comments: sort = self.determine_sort_function() - logger.debug(f'Retrieving comments of user {self.args.user}') - 
results.append(sort(self.reddit_instance.redditor(self.args.user).comments, limit=self.args.limit)) + for user in self.args.user: + logger.debug(f'Retrieving comments of user {user}') + results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit)) return results @staticmethod diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 9ab9d45..446bc82 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -35,7 +35,7 @@ class Configuration(Namespace): self.time: str = 'all' self.time_format = None self.upvoted: bool = False - self.user: Optional[str] = None + self.user: list[str] = [] self.verbose: int = 0 self.make_hard_links = False diff --git a/bdfr/connector.py b/bdfr/connector.py index c20b749..6aec2f5 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -74,7 +74,7 @@ class RedditConnector(metaclass=ABCMeta): logger.log(9, 'Create file name formatter') self.create_reddit_instance() - self.resolve_user_name() + self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user])) self.excluded_submission_ids = self.read_excluded_ids() @@ -256,14 +256,16 @@ class RedditConnector(metaclass=ABCMeta): else: return [] - def resolve_user_name(self): - if self.args.user == 'me': + def resolve_user_name(self, in_name: str) -> str: + if in_name == 'me': if self.authenticated: - self.args.user = self.reddit_instance.user.me().name - logger.log(9, f'Resolved user to {self.args.user}') + resolved_name = self.reddit_instance.user.me().name + logger.log(9, f'Resolved user to {resolved_name}') + return resolved_name else: - self.args.user = None logger.warning('To use "me" as a user, an authenticated Reddit instance must be used') + else: + return in_name def get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] @@ -289,10 +291,13 @@ class RedditConnector(metaclass=ABCMeta): def get_multireddits(self) -> list[Iterator]: if self.args.multireddit: + if len(self.args.user) != 1: + logger.error(f'Only 1 user can be supplied when retrieving from multireddits') + return [] out = [] for multi in self.split_args_input(self.args.multireddit): try: - multi = self.reddit_instance.multireddit(self.args.user, multi) + multi = self.reddit_instance.multireddit(self.args.user[0], multi) if not multi.subreddits: raise errors.BulkDownloaderException out.append(self.create_filtered_listing_generator(multi)) @@ -312,31 +317,31 @@ class RedditConnector(metaclass=ABCMeta): def get_user_data(self) -> list[Iterator]: if any([self.args.submitted, self.args.upvoted, self.args.saved]): - if self.args.user: + if not self.args.user: + logger.warning('At least one user must be supplied to download user data') + return [] + generators = [] + for user in self.args.user: try: - self.check_user_existence(self.args.user) + self.check_user_existence(user) except errors.BulkDownloaderException as e: logger.error(e) - return [] - generators = [] + continue if self.args.submitted: logger.debug(f'Retrieving submitted posts of user {self.args.user}') generators.append(self.create_filtered_listing_generator( - self.reddit_instance.redditor(self.args.user).submissions, + self.reddit_instance.redditor(user).submissions, )) if not self.authenticated and any((self.args.upvoted, self.args.saved)): logger.warning('Accessing user lists requires authentication') else: if self.args.upvoted: logger.debug(f'Retrieving upvoted posts of user {self.args.user}') - 
generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit)) + generators.append(self.reddit_instance.redditor(user).upvoted(limit=self.args.limit)) if self.args.saved: logger.debug(f'Retrieving saved posts of user {self.args.user}') - generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit)) - return generators - else: - logger.warning('A user must be supplied to download user data') - return [] + generators.append(self.reddit_instance.redditor(user).saved(limit=self.args.limit)) + return generators else: return [] diff --git a/tests/test_connector.py b/tests/test_connector.py index 1078707..03d2668 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -34,7 +34,7 @@ def downloader_mock(args: Configuration): return downloader_mock -def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]): +def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]) -> list: results = [sub for res in results for sub in res] assert all([isinstance(res, praw.models.Submission) for res in results]) if result_limit is not None: @@ -232,7 +232,7 @@ def test_get_multireddits_public( downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.limit = limit downloader_mock.args.multireddit = test_multireddits - downloader_mock.args.user = test_user + downloader_mock.args.user = [test_user] downloader_mock.reddit_instance = reddit_instance downloader_mock.create_filtered_listing_generator.return_value = \ RedditConnector.create_filtered_listing_generator( @@ -257,7 +257,7 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.submitted = True - downloader_mock.args.user = test_user + downloader_mock.args.user = [test_user] downloader_mock.authenticated = False downloader_mock.reddit_instance = reddit_instance downloader_mock.create_filtered_listing_generator.return_value = \ @@ -284,11 +284,10 @@ def test_get_user_authenticated_lists( ): downloader_mock.args.__dict__[test_flag] = True downloader_mock.reddit_instance = authenticated_reddit_instance - downloader_mock.args.user = 'me' downloader_mock.args.limit = 10 downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT - RedditConnector.resolve_user_name(downloader_mock) + downloader_mock.args.user = [RedditConnector.resolve_user_name(downloader_mock, 'me')] results = RedditConnector.get_user_data(downloader_mock) assert_all_results_are_submissions(10, results) diff --git a/tests/test_integration.py b/tests/test_integration.py index 7aec0eb..2ff1909 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -117,6 +117,7 @@ def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Pa @pytest.mark.authenticated @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( + ['--user', 'djnish', '--submitted', '--user', 'FriesWithThat', '-L', 10], ['--user', 'me', '--upvoted', '--authenticate', '-L', 10], ['--user', 'me', '--saved', '--authenticate', '-L', 10], ['--user', 'me', '--submitted', '--authenticate', '-L', 10], @@ -231,6 +232,7 @@ def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): @pytest.mark.skipif(not 
does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], + ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'], )) def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -265,12 +267,14 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path): ['--user', 'sdclhgsolgjeroij', '--upvoted', '-L', 10], ['--subreddit', 'submitters', '-L', 10], # Private subreddit ['--subreddit', 'donaldtrump', '-L', 10], # Banned subreddit + ['--user', 'djnish', '--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10], )) def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): runner = CliRunner() test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 + assert 'Downloaded' not in result.output @pytest.mark.online From be00bfb1bbcbf46a72a124f086cd7a9f48fd9b78 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 27 May 2021 15:24:18 +1000 Subject: [PATCH 043/150] Update README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9f32599..97dcc97 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,8 @@ The following options are common between both the `archive` and `download` comma - `-u, --user` - This specifies the user to scrape in concert with other options - When using `--authenticate`, `--user me` can be used to refer to the authenticated user + - Can be specified multiple times for multiple users + - If downloading a multireddit, only one user can be specified - `-v, --verbose` - Increases the verbosity of the program - Can be specified multiple times From 9a1e1ebea1750923a5e558326d5f192ae8adbe6f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 18 May 2021 12:39:08 +1000 Subject: [PATCH 044/150] Add path limit fix --- bdfr/file_name_formatter.py | 27 ++++++++++++++---- tests/test_file_name_formatter.py | 47 +++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index c6c13c2..2fbf95f 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -4,6 +4,7 @@ import datetime import logging import platform import re +import subprocess from pathlib import Path from typing import Optional @@ -104,32 +105,46 @@ class FileNameFormatter: ) -> Path: subfolder = Path( destination_directory, - *[self._format_name(resource.source_submission, part) for part in self.directory_format_string] + *[self._format_name(resource.source_submission, part) for part in self.directory_format_string], ) index = f'_{str(index)}' if index else '' if not resource.extension: raise BulkDownloaderException(f'Resource from {resource.url} has no extension') ending = index + resource.extension file_name = str(self._format_name(resource.source_submission, self.file_format_string)) - file_name = self._limit_file_name_length(file_name, ending) try: - file_path = Path(subfolder, file_name) + file_path = self._limit_file_name_length(file_name, ending, subfolder) except TypeError: raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}') return file_path @staticmethod - def _limit_file_name_length(filename: str, ending: str) -> str: + def _limit_file_name_length(filename: str, ending: str, root: Path) -> Path: + root = 
root.resolve().expanduser()
 possible_id = re.search(r'((?:_\w{6})?$)', filename)
 if possible_id:
 ending = possible_id.group(1) + ending
 filename = filename[:possible_id.start()]
+ max_path = FileNameFormatter.find_max_path_length()
 max_length_chars = 255 - len(ending)
 max_length_bytes = 255 - len(ending.encode('utf-8'))
- while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes:
+ max_path_length = max_path - len(ending) - len(str(root)) - 1
+ while len(filename) > max_length_chars or \
+ len(filename.encode('utf-8')) > max_length_bytes or \
+ len(filename) > max_path_length:
 filename = filename[:-1]
- return filename + ending
+ return Path(root, filename + ending)
+
+ @staticmethod
+ def find_max_path_length() -> int:
+ try:
+ return int(subprocess.check_output(['getconf', 'PATH_MAX', '/']))
+ except (ValueError, subprocess.CalledProcessError, OSError):
+ if platform.system() == 'Windows':
+ return 260
+ else:
+ return 4096

 def format_resource_paths(
 self,

diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py
index b1faf86..cc13e0c 100644
--- a/tests/test_file_name_formatter.py
+++ b/tests/test_file_name_formatter.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python3
 # coding=utf-8

+import platform
+import unittest.mock
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
 from unittest.mock import MagicMock

-import platform
 import praw.models
 import pytest

@@ -28,10 +29,10 @@ def submission() -> MagicMock:
 return test


-def do_test_string_equality(result: str, expected: str) -> bool:
+def do_test_string_equality(result: [Path, str], expected: str) -> bool:
 if platform.system() == 'Windows':
 expected = FileNameFormatter._format_for_windows(expected)
- return expected == result
+ return str(result).endswith(expected)


 def do_test_path_equality(result: Path, expected: str) -> bool:
@@ -41,7 +42,7 @@ def do_test_path_equality(result: Path, expected: str) -> bool:
 expected = Path(*expected)
 else:
 expected = Path(expected)
- return result == expected
+ return str(result).endswith(str(expected))


 @pytest.fixture(scope='session')
@@ -173,7 +174,9 @@ def test_format_multiple_resources():
 test_formatter = FileNameFormatter('{TITLE}', '', 'ISO')
 results = test_formatter.format_resource_paths(mocks, Path('.'))
 results = set([str(res[0]) for res in results])
- assert results == {'test_1.png', 'test_2.png', 'test_3.png', 'test_4.png'}
+ expected = set([str(Path(Path('.'), name).resolve())
+ for name in ('test_1.png', 'test_2.png', 'test_3.png', 'test_4.png')])
+ assert results == expected


@pytest.mark.parametrize(('test_filename', 'test_ending'), (
 ('😍💕✨' * 100, '_1.png'),
))
def test_limit_filename_length(test_filename: str, test_ending: str):
- result = FileNameFormatter._limit_file_name_length(test_filename, test_ending)
- assert len(result) <= 255
- assert len(result.encode('utf-8')) <= 255
- assert isinstance(result, str)
+ result = FileNameFormatter._limit_file_name_length(test_filename, test_ending, Path('.'))
+ assert len(result.name) <= 255
+ assert len(result.name.encode('utf-8')) <= 255
+ assert len(str(result)) <= FileNameFormatter.find_max_path_length()
+ assert isinstance(result, Path)


@pytest.mark.parametrize(('test_filename', 'test_ending', 'expected_end'), (
 ('😍💕✨' * 100 + '_aaa1aa', '_1.png', '_aaa1aa_1.png'),
))
def 
test_preserve_id_append_when_shortening(test_filename: str, test_ending: str, expected_end: str): - result = FileNameFormatter._limit_file_name_length(test_filename, test_ending) - assert len(result) <= 255 - assert len(result.encode('utf-8')) <= 255 - assert isinstance(result, str) - assert result.endswith(expected_end) + result = FileNameFormatter._limit_file_name_length(test_filename, test_ending, Path('.')) + assert len(result.name) <= 255 + assert len(result.name.encode('utf-8')) <= 255 + assert result.name.endswith(expected_end) + assert len(str(result)) <= FileNameFormatter.find_max_path_length() def test_shorten_filenames(submission: MagicMock, tmp_path: Path): @@ -295,7 +299,7 @@ def test_format_archive_entry_comment( test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO') test_entry = Resource(test_comment, '', '.json') result = test_formatter.format_path(test_entry, tmp_path) - assert do_test_string_equality(result.name, expected_name) + assert do_test_string_equality(result, expected_name) @pytest.mark.parametrize(('test_folder_scheme', 'expected'), ( @@ -364,3 +368,16 @@ def test_time_string_formats(test_time_format: str, expected: str): test_formatter = FileNameFormatter('{TITLE}', '', test_time_format) result = test_formatter._convert_timestamp(test_time.timestamp()) assert result == expected + + +def test_get_max_path_length(): + result = FileNameFormatter.find_max_path_length() + assert result in (4096, 260) + + +def test_windows_max_path(): + with unittest.mock.patch('platform.system', return_value='Windows'): + with unittest.mock.patch('bdfr.file_name_formatter.FileNameFormatter.find_max_path_length', return_value=260): + result = FileNameFormatter._limit_file_name_length('test' * 50, '_1.png', Path('test' * 25)) + assert len(str(result)) <= 260 + assert len(result.name) <= 75 From c89de29f72f8768c9145668ceb975fd198bc0937 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 18 May 2021 12:43:39 +1000 Subject: [PATCH 045/150] Update test condition --- tests/test_file_name_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index cc13e0c..8b0c462 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -380,4 +380,4 @@ def test_windows_max_path(): with unittest.mock.patch('bdfr.file_name_formatter.FileNameFormatter.find_max_path_length', return_value=260): result = FileNameFormatter._limit_file_name_length('test' * 50, '_1.png', Path('test' * 25)) assert len(str(result)) <= 260 - assert len(result.name) <= 75 + assert len(result.name) <= 150 From bf50618590bee214546e8b45c8f2bc39edec8708 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 18 May 2021 12:46:55 +1000 Subject: [PATCH 046/150] Add macOS value to test --- tests/test_file_name_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 8b0c462..efd1047 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -372,7 +372,7 @@ def test_time_string_formats(test_time_format: str, expected: str): def test_get_max_path_length(): result = FileNameFormatter.find_max_path_length() - assert result in (4096, 260) + assert result in (4096, 260, 1024) def test_windows_max_path(): From 12a508c8987c1759a3f1e27fb4228a4de14e8946 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 27 May 2021 15:29:43 +1000 Subject: [PATCH 047/150] Update failing test 
--- tests/test_file_name_formatter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index efd1047..1f5d22f 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -375,9 +375,9 @@ def test_get_max_path_length(): assert result in (4096, 260, 1024) -def test_windows_max_path(): +def test_windows_max_path(tmp_path: Path): with unittest.mock.patch('platform.system', return_value='Windows'): with unittest.mock.patch('bdfr.file_name_formatter.FileNameFormatter.find_max_path_length', return_value=260): - result = FileNameFormatter._limit_file_name_length('test' * 50, '_1.png', Path('test' * 25)) + result = FileNameFormatter._limit_file_name_length('test' * 100, '_1.png', tmp_path) assert len(str(result)) <= 260 - assert len(result.name) <= 150 + assert len(result.name) <= (260 - len(str(tmp_path))) From 6caa02adb173e6f0fac34f252e85cc7ce695d252 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 27 May 2021 15:58:40 +1000 Subject: [PATCH 048/150] Update failing test --- tests/test_file_name_formatter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 1f5d22f..e4c82ac 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -173,9 +173,8 @@ def test_format_multiple_resources(): mocks.append(new_mock) test_formatter = FileNameFormatter('{TITLE}', '', 'ISO') results = test_formatter.format_resource_paths(mocks, Path('.')) - results = set([str(res[0]) for res in results]) - expected = set([str(Path(Path('.'), name).resolve()) - for name in ('test_1.png', 'test_2.png', 'test_3.png', 'test_4.png')]) + results = set([str(res[0].name) for res in results]) + expected = {'test_1.png', 'test_2.png', 'test_3.png', 'test_4.png'} assert results == expected From 79fba4ac4a06e113c173294315a3e1a25581af6b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 31 May 2021 13:42:03 +1000 Subject: [PATCH 049/150] Fix indent --- bdfr/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index a262dae..6733691 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -105,7 +105,7 @@ class RedditDownloader(RedditConnector): os.utime(destination, (creation_time, creation_time)) self.master_hash_list[resource_hash] = destination logger.debug(f'Hash added to master list: {resource_hash}') - logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') + logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') @staticmethod def scan_existing_files(directory: Path) -> dict[str, Path]: From 434aeb8feba7627ac2763bbc162624585a150571 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sun, 6 Jun 2021 20:29:09 +1000 Subject: [PATCH 050/150] Add a combined command for the archiver and downloader: `clone` (#433) * Simplify downloader function * Add basic scraper class * Add "scrape" command * Rename "scrape" command to "clone" * Add integration tests for clone command * Update README * Fix failing test --- README.md | 14 +++++++- bdfr/__main__.py | 70 ++++++++++++++++++++++++++++----------- bdfr/cloner.py | 21 ++++++++++++ bdfr/downloader.py | 19 ++++++----- tests/test_downloader.py | 22 +++++++++--- tests/test_integration.py | 27 +++++++++++++++ 6 files changed, 139 insertions(+), 34 deletions(-) 
create mode 100644 bdfr/cloner.py

diff --git a/README.md b/README.md
index 97dcc97..fd5a3b7 100644
--- a/README.md
+++ b/README.md
@@ -26,16 +26,24 @@ If you want to use the source code or make contributions, refer to [CONTRIBUTING

The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.

-There are two modes to the BDFR: download, and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, and all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
+There are three modes to the BDFR: download, archive, and clone. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, and all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML. Lastly, the `clone` command will perform both functions of the previous commands at once and is more efficient than running those commands sequentially.
+
+Note that the `clone` command is not a true, faithful clone of Reddit. It simply retrieves much of the raw data that Reddit provides. To get a true clone of Reddit, another tool such as HTTrack should be used.

After installation, run the program from any directory as shown below:
+
```bash
python3 -m bdfr download
```
+
```bash
python3 -m bdfr archive
```
+```bash
+python3 -m bdfr clone
+```
+
However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are:

```bash
@@ -184,6 +192,10 @@ The following options are for the `archive` command specifically.
 - `xml`
 - `yaml`

+### Cloner Options
+
+The `clone` command can take all the options listed above for both the `archive` and `download` commands since it performs the functions of both.
+
## Authentication and Security

The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits. 
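As with the other subcommands, the new `clone` command can be driven through click's `CliRunner`, which is how the integration tests further down exercise it. A minimal sketch; the submission ID is borrowed from those tests, and a valid `test_config.cfg` in the working directory is assumed:

```python
#!/usr/bin/env python3
# coding=utf-8

# Sketch of invoking the new `clone` command through click's test runner,
# mirroring tests/test_integration.py. The config file and submission ID are
# illustrative; a real Reddit client ID must be present in the config.

import tempfile
from pathlib import Path

from click.testing import CliRunner

from bdfr.__main__ import cli

if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as tmp_dir:
        runner = CliRunner()
        test_args = [
            'clone',
            tmp_dir,
            '-v',
            '--config', 'test_config.cfg',
            '--log', str(Path(tmp_dir, 'test_log.txt')),
            '-l', 'm2601g',  # a single submission, as in the integration tests
        ]
        result = runner.invoke(cli, test_args)
        print(result.exit_code)
        print(result.output)
```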
diff --git a/bdfr/__main__.py b/bdfr/__main__.py index cf039a5..0d299c9 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -8,6 +8,7 @@ import click from bdfr.archiver import Archiver from bdfr.configuration import Configuration from bdfr.downloader import RedditDownloader +from bdfr.cloner import RedditCloner logger = logging.getLogger() @@ -32,11 +33,32 @@ _common_options = [ 'controversial', 'rising', 'relevance')), default=None), ] +_downloader_options = [ + click.option('--file-scheme', default=None, type=str), + click.option('--folder-scheme', default=None, type=str), + click.option('--make-hard-links', is_flag=True, default=None), + click.option('--max-wait-time', type=int, default=None), + click.option('--no-dupes', is_flag=True, default=None), + click.option('--search-existing', is_flag=True, default=None), + click.option('--exclude-id', default=None, multiple=True), + click.option('--exclude-id-file', default=None, multiple=True), + click.option('--skip', default=None, multiple=True), + click.option('--skip-domain', default=None, multiple=True), + click.option('--skip-subreddit', default=None, multiple=True), +] -def _add_common_options(func): - for opt in _common_options: - func = opt(func) - return func +_archiver_options = [ + click.option('--all-comments', is_flag=True, default=None), + click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None), +] + + +def _add_options(opts: list): + def wrap(func): + for opt in opts: + func = opt(func) + return func + return wrap @click.group() @@ -45,18 +67,8 @@ def cli(): @cli.command('download') -@click.option('--file-scheme', default=None, type=str) -@click.option('--folder-scheme', default=None, type=str) -@click.option('--make-hard-links', is_flag=True, default=None) -@click.option('--max-wait-time', type=int, default=None) -@click.option('--no-dupes', is_flag=True, default=None) -@click.option('--search-existing', is_flag=True, default=None) -@click.option('--exclude-id', default=None, multiple=True) -@click.option('--exclude-id-file', default=None, multiple=True) -@click.option('--skip', default=None, multiple=True) -@click.option('--skip-domain', default=None, multiple=True) -@click.option('--skip-subreddit', default=None, multiple=True) -@_add_common_options +@_add_options(_common_options) +@_add_options(_downloader_options) @click.pass_context def cli_download(context: click.Context, **_): config = Configuration() @@ -73,9 +85,8 @@ def cli_download(context: click.Context, **_): @cli.command('archive') -@_add_common_options -@click.option('--all-comments', is_flag=True, default=None) -@click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None) +@_add_options(_common_options) +@_add_options(_archiver_options) @click.pass_context def cli_archive(context: click.Context, **_): config = Configuration() @@ -85,7 +96,26 @@ def cli_archive(context: click.Context, **_): reddit_archiver = Archiver(config) reddit_archiver.download() except Exception: - logger.exception('Downloader exited unexpectedly') + logger.exception('Archiver exited unexpectedly') + raise + else: + logger.info('Program complete') + + +@cli.command('clone') +@_add_options(_common_options) +@_add_options(_archiver_options) +@_add_options(_downloader_options) +@click.pass_context +def cli_clone(context: click.Context, **_): + config = Configuration() + config.process_click_arguments(context) + setup_logging(config.verbose) + try: + reddit_scraper = RedditCloner(config) + reddit_scraper.download() + except 
Exception: + logger.exception('Scraper exited unexpectedly') raise else: logger.info('Program complete') diff --git a/bdfr/cloner.py b/bdfr/cloner.py new file mode 100644 index 0000000..979f50f --- /dev/null +++ b/bdfr/cloner.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import logging + +from bdfr.archiver import Archiver +from bdfr.configuration import Configuration +from bdfr.downloader import RedditDownloader + +logger = logging.getLogger(__name__) + + +class RedditCloner(RedditDownloader, Archiver): + def __init__(self, args: Configuration): + super(RedditCloner, self).__init__(args) + + def download(self): + for generator in self.reddit_lists: + for submission in generator: + self._download_submission(submission) + self.write_entry(submission) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 6733691..3b2c581 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -42,19 +42,20 @@ class RedditDownloader(RedditConnector): def download(self): for generator in self.reddit_lists: for submission in generator: - if submission.id in self.excluded_submission_ids: - logger.debug(f'Object {submission.id} in exclusion list, skipping') - continue - elif submission.subreddit.display_name.lower() in self.args.skip_subreddit: - logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list') - else: - logger.debug(f'Attempting to download submission {submission.id}') - self._download_submission(submission) + self._download_submission(submission) def _download_submission(self, submission: praw.models.Submission): - if not isinstance(submission, praw.models.Submission): + if submission.id in self.excluded_submission_ids: + logger.debug(f'Object {submission.id} in exclusion list, skipping') + return + elif submission.subreddit.display_name.lower() in self.args.skip_subreddit: + logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list') + return + elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') return + + logger.debug(f'Attempting to download submission {submission.id}') try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(submission) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index b4f175d..d67aee6 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -4,11 +4,12 @@ import os import re from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import praw.models import pytest +import bdfr.site_downloaders.download_factory from bdfr.__main__ import setup_logging from bdfr.configuration import Configuration from bdfr.connector import RedditConnector @@ -37,17 +38,30 @@ def downloader_mock(args: Configuration): (('aaaaaa',), ('aaaaaa',), 0), ((), ('aaaaaa',), 0), (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1), + (('aaaaaa', 'bbbbbb', 'cccccc'), ('aaaaaa',), 2), )) -def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock): +@patch('bdfr.site_downloaders.download_factory.DownloadFactory.pull_lever') +def test_excluded_ids( + mock_function: MagicMock, + test_ids: tuple[str], + test_excluded: tuple[str], + expected_len: int, + downloader_mock: MagicMock, +): downloader_mock.excluded_submission_ids = test_excluded + mock_function.return_value = MagicMock() + mock_function.return_value.__name__ = 'test' test_submissions = [] for test_id in test_ids: m = MagicMock() m.id 
= test_id + m.subreddit.display_name.return_value = 'https://www.example.com/' + m.__class__ = praw.models.Submission test_submissions.append(m) downloader_mock.reddit_lists = [test_submissions] - RedditDownloader.download(downloader_mock) - assert downloader_mock._download_submission.call_count == expected_len + for submission in test_submissions: + RedditDownloader._download_submission(downloader_mock, submission) + assert mock_function.call_count == expected_len @pytest.mark.online diff --git a/tests/test_integration.py b/tests/test_integration.py index 2ff1909..ed67f03 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -33,6 +33,17 @@ def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path): return out +def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): + out = [ + 'clone', + str(tmp_path), + '-v', + '--config', 'test_config.cfg', + '--log', str(Path(tmp_path, 'test_log.txt')), + ] + test_args + return out + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @@ -343,3 +354,19 @@ def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Some files might not be downloaded due to name conflicts' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g'], + ['-s', 'TrollXChromosomes/', '-L', 1], +)) +def test_cli_scrape_general(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_cloner_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission' in result.output + assert 'Record for entry item' in result.output From 6dcef83666c2c5491ea984b776be2363e140211b Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sun, 6 Jun 2021 20:47:56 +1000 Subject: [PATCH 051/150] Add ability to disable modules (#434) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix test name to match standard * Rename file * Add ability to disable modules * Update README * Fix missing comma * Fix more missing commas. sigh... 
Co-authored-by: Ali Parlakçı 
---
 README.md | 21 +++++++++++++++++++
 bdfr/__main__.py | 17 ++++++++-------
 bdfr/configuration.py | 7 ++++---
 bdfr/connector.py | 11 ++++++++++
 bdfr/downloader.py | 4 +++-
 ...fallback.py => test_youtubedl_fallback.py} | 0
 tests/test_connector.py | 4 +++-
 tests/test_integration.py | 19 ++++++++++++++++-
 8 files changed, 69 insertions(+), 14 deletions(-)
 rename tests/site_downloaders/fallback_downloaders/{youtubedl_fallback.py => test_youtubedl_fallback.py} (100%)

diff --git a/README.md b/README.md
index fd5a3b7..405cf84 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,10 @@ The following options are common between both the `archive` and `download` comma
 - `--config`
 - If the path to a configuration file is supplied with this option, the BDFR will use the specified config
 - See [Configuration Files](#configuration) for more details
+- `--disable-module`
+ - Can be specified multiple times
+ - Disables certain modules from being used
+ - See [Disabling Modules](#disabling-modules) for more information and a list of module names
 - `--log`
 - This allows one to specify the location of the logfile
 - This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
@@ -266,6 +270,7 @@ The following keys are optional, and defaults will be used if they cannot be fou
 - `backup_log_count`
 - `max_wait_time`
 - `time_format`
+ - `disabled_modules`

All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.

@@ -277,6 +282,22 @@ The option `time_format` will specify the format of the timestamp that replaces

The format can be specified through the [format codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python `datetime` library.

+#### Disabling Modules
+
+The individual modules of the BDFR, used to download submissions from websites, can be disabled. This is especially helpful in the case of the fallback downloaders, since the `--skip-domain` option cannot be effectively used in these cases. For example, the Youtube-DL downloader can retrieve data from hundreds of websites and domains; thus the only way to fully disable it is via the `--disable-module` option.
+
+Modules can be disabled through the command line interface for the BDFR or more permanently in the configuration file via the `disabled_modules` option. The downloaders that can be disabled are the following. Note that they are case-insensitive.
+
+- `Direct`
+- `Erome`
+- `Gallery` (Reddit Image Galleries)
+- `Gfycat`
+- `Imgur`
+- `Redgifs`
+- `SelfPost` (Reddit Text Post)
+- `Youtube`
+- `YoutubeDlFallback`
+
### Rate Limiting

The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases. 
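The case-insensitive matching described in the README section above reduces to lowercasing both the configured names and each downloader's class name, which the `connector.py` and `downloader.py` hunks below wire in. A minimal sketch of the check itself; the `Youtube` class here is a hypothetical stand-in, and the real method also splits comma-separated values before normalising:

```python
#!/usr/bin/env python3
# coding=utf-8

# Sketch of the case-insensitive module-disabling check described above.
# `Youtube` is a stand-in for any downloader class named in the README; the
# real normalisation lives in RedditConnector.parse_disabled_modules.

class Youtube:
    """Stand-in for a site downloader class such as the BDFR's Youtube module."""


def normalise_disabled_modules(raw_names: list[str]) -> set[str]:
    # Strip whitespace and lowercase each name so '--disable-module YouTube'
    # and 'disabled_modules = youtube' both match the Youtube class
    return set(name.strip().lower() for name in raw_names)


if __name__ == '__main__':
    disabled = normalise_disabled_modules([' YouTube ', 'YoutubeDlFallback'])
    downloader_class = Youtube
    if downloader_class.__name__.lower() in disabled:
        print(f'Skipping submission: module {downloader_class.__name__} is disabled')
```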
diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index 0d299c9..1103581 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -14,19 +14,20 @@ logger = logging.getLogger()
 _common_options = [
     click.argument('directory', type=str),
-    click.option('--config', type=str, default=None),
-    click.option('-v', '--verbose', default=None, count=True),
-    click.option('-l', '--link', multiple=True, default=None, type=str),
-    click.option('-s', '--subreddit', multiple=True, default=None, type=str),
-    click.option('-m', '--multireddit', multiple=True, default=None, type=str),
-    click.option('-L', '--limit', default=None, type=int),
     click.option('--authenticate', is_flag=True, default=None),
+    click.option('--config', type=str, default=None),
+    click.option('--disable-module', multiple=True, default=None, type=str),
     click.option('--log', type=str, default=None),
-    click.option('--submitted', is_flag=True, default=None),
-    click.option('--upvoted', is_flag=True, default=None),
     click.option('--saved', is_flag=True, default=None),
     click.option('--search', default=None, type=str),
+    click.option('--submitted', is_flag=True, default=None),
     click.option('--time-format', type=str, default=None),
+    click.option('--upvoted', is_flag=True, default=None),
+    click.option('-L', '--limit', default=None, type=int),
+    click.option('-l', '--link', multiple=True, default=None, type=str),
+    click.option('-m', '--multireddit', multiple=True, default=None, type=str),
+    click.option('-s', '--subreddit', multiple=True, default=None, type=str),
+    click.option('-v', '--verbose', default=None, count=True),
     click.option('-u', '--user', type=str, multiple=True, default=None),
     click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
     click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
diff --git a/bdfr/configuration.py b/bdfr/configuration.py
index 446bc82..327a453 100644
--- a/bdfr/configuration.py
+++ b/bdfr/configuration.py
@@ -13,19 +13,21 @@ class Configuration(Namespace):
         self.authenticate = False
         self.config = None
         self.directory: str = '.'
+        self.disable_module: list[str] = []
         self.exclude_id = []
         self.exclude_id_file = []
+        self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
+        self.folder_scheme: str = '{SUBREDDIT}'
         self.limit: Optional[int] = None
         self.link: list[str] = []
         self.log: Optional[str] = None
+        self.make_hard_links = False
         self.max_wait_time = None
         self.multireddit: list[str] = []
         self.no_dupes: bool = False
         self.saved: bool = False
         self.search: Optional[str] = None
         self.search_existing: bool = False
-        self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
-        self.folder_scheme: str = '{SUBREDDIT}'
         self.skip: list[str] = []
         self.skip_domain: list[str] = []
         self.skip_subreddit: list[str] = []
@@ -37,7 +39,6 @@ class Configuration(Namespace):
         self.upvoted: bool = False
         self.user: list[str] = []
         self.verbose: int = 0
-        self.make_hard_links = False
 
         # Archiver-specific options
         self.format = 'json'
diff --git a/bdfr/connector.py b/bdfr/connector.py
index 6aec2f5..68efc0c 100644
--- a/bdfr/connector.py
+++ b/bdfr/connector.py
@@ -64,6 +64,8 @@ class RedditConnector(metaclass=ABCMeta):
 
         self.read_config()
 
+        self.parse_disabled_modules()
+
         self.download_filter = self.create_download_filter()
         logger.log(9, 'Created download filter')
         self.time_filter = self.create_time_filter()
@@ -99,10 +101,19 @@ class RedditConnector(metaclass=ABCMeta):
             option = 'ISO'
         logger.debug(f'Setting datetime format string to {option}')
         self.args.time_format = option
+        if not self.args.disable_module:
+            self.args.disable_module = [self.cfg_parser.get('DEFAULT', 'disabled_modules', fallback='')]
         # Update config on disk
         with open(self.config_location, 'w') as file:
             self.cfg_parser.write(file)
 
+    def parse_disabled_modules(self):
+        disabled_modules = self.args.disable_module
+        disabled_modules = self.split_args_input(disabled_modules)
+        disabled_modules = set([name.strip().lower() for name in disabled_modules])
+        self.args.disable_module = disabled_modules
+        logger.debug(f'Disabling the following modules: {", ".join(self.args.disable_module)}')
+
     def create_reddit_instance(self):
         if self.args.authenticate:
             logger.debug('Using authenticated Reddit instance')
diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 3b2c581..61158a3 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -63,7 +63,9 @@ class RedditDownloader(RedditConnector):
         except errors.NotADownloadableLinkError as e:
             logger.error(f'Could not download submission {submission.id}: {e}')
             return
-
+        if downloader_class.__name__.lower() in self.args.disable_module:
+            logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')
+            return
         try:
             content = downloader.find_resources(self.authenticator)
         except errors.SiteDownloaderError as e:
diff --git a/tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py
similarity index 100%
rename from tests/site_downloaders/fallback_downloaders/youtubedl_fallback.py
rename to tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py
diff --git a/tests/test_connector.py b/tests/test_connector.py
index 03d2668..2249b96 100644
--- a/tests/test_connector.py
+++ b/tests/test_connector.py
@@ -313,7 +313,9 @@ def test_sanitise_subreddit_name(test_name: str, expected: str):
     (['test1,test2', 'test3'], {'test1', 'test2', 'test3'}),
     (['test1, test2', 'test3'], {'test1', 'test2', 'test3'}),
     (['test1; test2', 'test3'], {'test1', 'test2', 'test3'}),
-    (['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'})
+    (['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'}),
+    ([''], {''}),
+    (['test'], {'test'}),
 ))
 def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
     results = RedditConnector.split_args_input(test_subreddit_entries)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index ed67f03..6a9e52b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -348,7 +348,7 @@ def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path):
     ['--file-scheme', '{TITLE}'],
     ['--file-scheme', '{TITLE}_test_{SUBREDDIT}'],
 ))
-def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
+def test_cli_download_file_scheme_warning(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
     test_args = create_basic_args_for_download_runner(test_args, tmp_path)
     result = runner.invoke(cli, test_args)
@@ -356,6 +356,23 @@ def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
     assert 'Some files might not be downloaded due to name conflicts' in result.output
 
 
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['-l', 'm2601g', '--disable-module', 'Direct'],
+    ['-l', 'nnb9vs', '--disable-module', 'YoutubeDlFallback'],
+    ['-l', 'nnb9vs', '--disable-module', 'youtubedlfallback'],
+))
+def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = create_basic_args_for_download_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'skipped due to disabled module' in result.output
+    assert 'Downloaded submission' not in result.output
+
+
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')

From 19e97174f3784090fb725829b7fd6dbae8691eed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Sun, 6 Jun 2021 14:04:34 +0300
Subject: [PATCH 052/150] Bump the version to v2.2.0

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index b1345d9..2969fe0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@ description_file = README.md
 description_content_type = text/markdown
 home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
 keywords = reddit, download, archive
-version = 2.1.1
+version = 2.2.0
 author = Ali Parlakci
 author_email = parlakciali@gmail.com
 maintainer = Serene Arc

From 8be3efb6e4adff06e77945d2a1af6f23fed5ad63 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 8 Jun 2021 13:08:39 +1000
Subject: [PATCH 053/150] Fix bug with Imgur gifs being shortened too much

The rstrip function was used wrongly: it doesn't remove a substring but rather removes any of the characters provided, so here it removed any trailing I, G, V, or F that finished the Imgur ID, resulting in a 404 error for the resources in question.
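The pitfall is easy to reproduce with plain Python string semantics; a short demonstration using the gifv link from the test added below:

```python
import re

link = 'https://i.imgur.com/lFJai6i.gifv'

# str.rstrip() treats its argument as a *set* of characters, so it also strips
# the trailing 'i' of the ID 'lFJai6i' and the shortened link 404s:
print(link.rstrip('.gifv'))          # https://i.imgur.com/lFJai6

# An anchored regex removes only the literal suffix, as the fix does:
print(re.sub(r'\.gifv$', '', link))  # https://i.imgur.com/lFJai6i
```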
---
 bdfr/site_downloaders/imgur.py       | 2 +-
 tests/site_downloaders/test_imgur.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py
index 3d071d4..bd974be 100644
--- a/bdfr/site_downloaders/imgur.py
+++ b/bdfr/site_downloaders/imgur.py
@@ -39,7 +39,7 @@ class Imgur(BaseDownloader):
     def _get_data(link: str) -> dict:
         if re.match(r'.*\.gifv$', link):
             link = link.replace('i.imgur', 'imgur')
-            link = link.rstrip('.gifv')
+            link = re.sub('\\.gifv$', '', link)
 
         res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})
 
diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py
index 792926a..0e557ed 100644
--- a/tests/site_downloaders/test_imgur.py
+++ b/tests/site_downloaders/test_imgur.py
@@ -130,6 +130,12 @@ def test_imgur_extension_validation_bad(test_extension: str):
             'fb6c913d721c0bbb96aa65d7f560d385',
         ),
     ),
+    (
+        'https://i.imgur.com/lFJai6i.gifv',
+        (
+            '01a6e79a30bec0e644e5da12365d5071',
+        ),
+    )
 ))
 def test_find_resources(test_url: str, expected_hashes: list[str]):
     mock_download = Mock()

From 8ba2d0bb555d059acd216222cb16c5d76d0d3942 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 10 Jun 2021 18:59:22 +1000
Subject: [PATCH 054/150] Add missing return statement

---
 bdfr/downloader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 61158a3..a0d8834 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -104,6 +104,7 @@ class RedditDownloader(RedditConnector):
         except OSError as e:
             logger.exception(e)
             logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}')
+            return
         creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
         os.utime(destination, (creation_time, creation_time))
         self.master_hash_list[resource_hash] = destination

From 6eeadc88214bf3b5aff8c893ac6a338b38d26187 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 11 Jun 2021 15:31:11 +1000
Subject: [PATCH 055/150] Add option for archiver full context

---
 bdfr/__main__.py          |  1 +
 bdfr/archiver.py          |  3 +++
 bdfr/configuration.py     |  3 ++-
 tests/test_integration.py | 14 ++++++++++++++
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index 1103581..6312c76 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -50,6 +50,7 @@ _downloader_options = [
 
 _archiver_options = [
     click.option('--all-comments', is_flag=True, default=None),
+    click.option('--full-context', is_flag=True, default=None),
     click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
 ]
diff --git a/bdfr/archiver.py b/bdfr/archiver.py
index b19a042..f2870cc 100644
--- a/bdfr/archiver.py
+++ b/bdfr/archiver.py
@@ -61,6 +61,9 @@ class Archiver(RedditConnector):
         raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')
 
     def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
+        if self.args.full_context and isinstance(praw_item, praw.models.Comment):
+            logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}')
+            praw_item = praw_item.submission
         archive_entry = self._pull_lever_entry_factory(praw_item)
         if self.args.format == 'json':
             self._write_entry_json(archive_entry)
diff --git a/bdfr/configuration.py b/bdfr/configuration.py
index 327a453..558b79f 100644
--- a/bdfr/configuration.py
+++ b/bdfr/configuration.py
@@ -41,8 +41,9 @@ class Configuration(Namespace):
         self.verbose: int = 0
 
         # Archiver-specific options
-        self.format = 'json'
         self.all_comments = False
+        self.format = 'json'
+        self.full_context: bool = False
 
     def process_click_arguments(self, context: click.Context):
         for arg_key in context.params.keys():
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 6a9e52b..0a6de3d 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -252,6 +252,20 @@ def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path):
     assert result.exit_code == 0
 
 
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['--full-context', '--link', 'gxqapql'],
+))
+def test_cli_archive_full_context(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'Converting comment' in result.output
+
+
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.slow

From e500bc4ad4be19dd5d63ca7c7c1f4d7ccc51f5b3 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 11 Jun 2021 15:35:12 +1000
Subject: [PATCH 056/150] Update README

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index a06e4af..bf6d4f9 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,9 @@ The following options are for the `archive` command specifically.
   - `json` (default)
   - `xml`
   - `yaml`
+- `--full-context`
+  - This option will, instead of downloading an individual comment, download the submission that comment is a part of
+  - May result in a longer run time as it retrieves much more data
 
 ### Cloner Options
 

From 9fd8b29833fa5f87ebdb6c4786f6489e79f1c297 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 11 Jun 2021 18:36:40 +1000
Subject: [PATCH 057/150] Add another logging message to script

---
 scripts/extract_failed_ids.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh
index 89f1896..104c7af 100755
--- a/scripts/extract_failed_ids.sh
+++ b/scripts/extract_failed_ids.sh
@@ -19,4 +19,5 @@ fi
     grep 'Failed to download resource' "$file" | awk '{ print $15 }' ;
     grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ;
     grep 'Failed to write file' "$file" | awk '{ print $16 }' | rev | cut -c 2- | rev ;
+    grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ;
 } >>"$output"

From b4ae513e7105b2f17d85227bbe65c17f91677b35 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 11 Jun 2021 18:40:26 +1000
Subject: [PATCH 058/150] Add submodules for bash testing

---
 .gitmodules                            | 9 +++++++++
 scripts/tests/bats                     | 1 +
 scripts/tests/test_helper/bats-assert  | 1 +
 scripts/tests/test_helper/bats-support | 1 +
 4 files changed, 12 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 scripts/tests/bats
 create mode 160000 scripts/tests/test_helper/bats-assert
 create mode 160000 scripts/tests/test_helper/bats-support

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..ea9e094
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "scripts/tests/bats"]
+	path = scripts/tests/bats
+	url = https://github.com/bats-core/bats-core.git
+[submodule "scripts/tests/test_helper/bats-assert"]
+	path = scripts/tests/test_helper/bats-assert
+	url = https://github.com/bats-core/bats-assert.git
+[submodule "scripts/tests/test_helper/bats-support"]
+	path = scripts/tests/test_helper/bats-support
+	url = https://github.com/bats-core/bats-support.git
diff --git a/scripts/tests/bats b/scripts/tests/bats
new file mode 160000
index 0000000..ce5ca28
--- /dev/null
+++ b/scripts/tests/bats
@@ -0,0 +1 @@
+Subproject commit ce5ca2802fabe5dc38393240cd40e20f8928d3b0
diff --git a/scripts/tests/test_helper/bats-assert b/scripts/tests/test_helper/bats-assert
new file mode 160000
index 0000000..e0de84e
--- /dev/null
+++ b/scripts/tests/test_helper/bats-assert
@@ -0,0 +1 @@
+Subproject commit e0de84e9c011223e7f88b7ccf1c929f4327097ba
diff --git a/scripts/tests/test_helper/bats-support b/scripts/tests/test_helper/bats-support
new file mode 160000
index 0000000..d140a65
--- /dev/null
+++ b/scripts/tests/test_helper/bats-support
@@ -0,0 +1 @@
+Subproject commit d140a65044b2d6810381935ae7f0c94c7023c8c3

From e009fab5047315356a1bf083e847c9de2f4fd6db Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 12 Jun 2021 08:41:38 +1000
Subject: [PATCH 059/150] Add empty files

---
 scripts/tests/README.md                      | 0
 scripts/tests/test_extract_failed_ids.sh     | 0
 scripts/tests/test_extract_successful_ids.sh | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 scripts/tests/README.md
 create mode 100644 scripts/tests/test_extract_failed_ids.sh
 create mode 100644 scripts/tests/test_extract_successful_ids.sh

diff --git a/scripts/tests/README.md b/scripts/tests/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/tests/test_extract_failed_ids.sh b/scripts/tests/test_extract_failed_ids.sh
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/tests/test_extract_successful_ids.sh b/scripts/tests/test_extract_successful_ids.sh
new file mode 100644
index 0000000..e69de29

From c5c010bce025fe4cf56c1ccfc40c27eb05db2c11 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 12 Jun 2021 10:35:31 +1000
Subject: [PATCH 060/150] Rename option

---
 README.md             | 2 +-
 bdfr/__main__.py      | 2 +-
 bdfr/archiver.py      | 2 +-
 bdfr/configuration.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index bf6d4f9..be4f455 100644
--- a/README.md
+++ b/README.md
@@ -196,7 +196,7 @@ The following options are for the `archive` command specifically.
   - `json` (default)
   - `xml`
   - `yaml`
-- `--full-context`
+- `--comment-context`
   - This option will, instead of downloading an individual comment, download the submission that comment is a part of
   - May result in a longer run time as it retrieves much more data
diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index 6312c76..67e4f99 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -50,7 +50,7 @@ _downloader_options = [
 
 _archiver_options = [
     click.option('--all-comments', is_flag=True, default=None),
-    click.option('--full-context', is_flag=True, default=None),
+    click.option('--comment-context', is_flag=True, default=None),
     click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
 ]
diff --git a/bdfr/archiver.py b/bdfr/archiver.py
index f2870cc..74b92e8 100644
--- a/bdfr/archiver.py
+++ b/bdfr/archiver.py
@@ -61,7 +61,7 @@ class Archiver(RedditConnector):
         raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')
 
     def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
-        if self.args.full_context and isinstance(praw_item, praw.models.Comment):
+        if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
             logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}')
             praw_item = praw_item.submission
         archive_entry = self._pull_lever_entry_factory(praw_item)
diff --git a/bdfr/configuration.py b/bdfr/configuration.py
index 558b79f..36a1860 100644
--- a/bdfr/configuration.py
+++ b/bdfr/configuration.py
@@ -43,7 +43,7 @@ class Configuration(Namespace):
         # Archiver-specific options
         self.all_comments = False
         self.format = 'json'
-        self.full_context: bool = False
+        self.comment_context: bool = False
 
     def process_click_arguments(self, context: click.Context):
         for arg_key in context.params.keys():

From a8bc4f999e603f3f5e4555569c65d3cfaeb509eb Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 12 Jun 2021 10:41:50 +1000
Subject: [PATCH 061/150] Rename files to proper extension

---
 .../{test_extract_failed_ids.sh => test_extract_failed_ids.bats}  | 0
 ...extract_successful_ids.sh => test_extract_successful_ids.bats} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename scripts/tests/{test_extract_failed_ids.sh => test_extract_failed_ids.bats} (100%)
 rename scripts/tests/{test_extract_successful_ids.sh => test_extract_successful_ids.bats} (100%)

diff --git a/scripts/tests/test_extract_failed_ids.sh b/scripts/tests/test_extract_failed_ids.bats
similarity index 100%
rename from scripts/tests/test_extract_failed_ids.sh
rename to scripts/tests/test_extract_failed_ids.bats
diff --git a/scripts/tests/test_extract_successful_ids.sh b/scripts/tests/test_extract_successful_ids.bats
similarity index 100%
rename from scripts/tests/test_extract_successful_ids.sh
rename to scripts/tests/test_extract_successful_ids.bats

From 7c27b7bf127b38b8c8f020ef5a645836b33fc41a Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 13 Jun 2021 09:49:42 +1000
Subject: [PATCH 062/150] Update logging message

---
 bdfr/downloader.py            | 2 +-
 scripts/extract_failed_ids.sh | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index a0d8834..ab6bf56 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -103,7 +103,7 @@ class RedditDownloader(RedditConnector):
             logger.debug(f'Written file to {destination}')
         except OSError as e:
             logger.exception(e)
-            logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}')
+            logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
             return
         creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
         os.utime(destination, (creation_time, creation_time))
diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh
index 104c7af..f96bd9a 100755
--- a/scripts/extract_failed_ids.sh
+++ b/scripts/extract_failed_ids.sh
@@ -11,13 +11,13 @@ if [ -n "$2" ]; then
     output="$2"
     echo "Outputting IDs to $output"
 else
-    output="failed.txt"
+    output="./failed.txt"
 fi
 
 {
     grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ;
     grep 'Failed to download resource' "$file" | awk '{ print $15 }' ;
     grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ;
-    grep 'Failed to write file' "$file" | awk '{ print $16 }' | rev | cut -c 2- | rev ;
+    grep 'Failed to write file' "$file" | awk '{ print $13 }' | rev | cut -c 2- | rev ;
     grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ;
 } >>"$output"

From 72238f39bac5c0b3f5041a479bceb532f3d5fabc Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 13 Jun 2021 09:49:57 +1000
Subject: [PATCH 063/150] Update script

---
 scripts/extract_successful_ids.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh
index 19e8bd7..011ba6c 100755
--- a/scripts/extract_successful_ids.sh
+++ b/scripts/extract_successful_ids.sh
@@ -11,7 +11,7 @@ if [ -n "$2" ]; then
     output="$2"
     echo "Outputting IDs to $output"
 else
-    output="successful.txt"
+    output="./successful.txt"
 fi
 
 {

From 6755d15675b7cebaa98596507ab95fe7f259ab5a Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 13 Jun 2021 09:50:41 +1000
Subject: [PATCH 064/150] Add tests for bash scripts

---
 .../failed_disabled_module.txt                |  1 +
 .../example_logfiles/failed_no_downloader.txt |  3 ++
 .../failed_resource_error.txt                 |  2 +
 .../failed_sitedownloader_error.txt           |  2 +
 .../example_logfiles/failed_write_error.txt   |  1 +
 .../succeed_already_exists.txt                |  3 ++
 .../succeed_download_filter.txt               |  3 ++
 .../succeed_downloaded_submission.txt         |  7 +++
 .../example_logfiles/succeed_hard_link.txt    |  1 +
 .../succeed_resource_hash.txt                 |  1 +
 scripts/tests/test_extract_failed_ids.bats    | 43 +++++++++++++++++++
 .../tests/test_extract_successful_ids.bats    | 38 ++++++++++++++++
 12 files changed, 105 insertions(+)
 create mode 100644 scripts/tests/example_logfiles/failed_disabled_module.txt
 create mode 100644 scripts/tests/example_logfiles/failed_no_downloader.txt
 create mode 100644 scripts/tests/example_logfiles/failed_resource_error.txt
 create mode 100644 scripts/tests/example_logfiles/failed_sitedownloader_error.txt
 create mode 100644 scripts/tests/example_logfiles/failed_write_error.txt
 create mode 100644 scripts/tests/example_logfiles/succeed_already_exists.txt
 create mode 100644 scripts/tests/example_logfiles/succeed_download_filter.txt
 create mode 100644 scripts/tests/example_logfiles/succeed_downloaded_submission.txt
 create mode 100644 scripts/tests/example_logfiles/succeed_hard_link.txt
 create mode 100644 scripts/tests/example_logfiles/succeed_resource_hash.txt

diff --git a/scripts/tests/example_logfiles/failed_disabled_module.txt b/scripts/tests/example_logfiles/failed_disabled_module.txt
new file mode 100644
index 0000000..50fd552
--- /dev/null
+++ b/scripts/tests/example_logfiles/failed_disabled_module.txt
@@ -0,0 +1 @@
+[2021-06-12 12:49:18,452 - bdfr.downloader - DEBUG] - Submission m2601g skipped due to disabled module Direct
diff --git a/scripts/tests/example_logfiles/failed_no_downloader.txt b/scripts/tests/example_logfiles/failed_no_downloader.txt
new file mode 100644
index 0000000..511d11f
--- /dev/null
+++ b/scripts/tests/example_logfiles/failed_no_downloader.txt
@@ -0,0 +1,3 @@
+[2021-06-12 11:13:35,665 - bdfr.downloader - ERROR] - Could not download submission nxv3ew: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.447961v1?rss=1
+[2021-06-12 11:14:21,958 - bdfr.downloader - ERROR] - Could not download submission nxv3ek: No downloader module exists for url https://alkossegyedit.hu/termek/pluss-macko-poloval-20cm/?feed_id=34832&_unique_id=60c40a1190ccb&utm_source=Reddit&utm_medium=AEAdmin&utm_campaign=Poster
+[2021-06-12 11:17:53,456 - bdfr.downloader - ERROR] - Could not download submission nxv3ea: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.448067v1?rss=1
diff --git a/scripts/tests/example_logfiles/failed_resource_error.txt b/scripts/tests/example_logfiles/failed_resource_error.txt
new file mode 100644
index 0000000..c2ba24c
--- /dev/null
+++ b/scripts/tests/example_logfiles/failed_resource_error.txt
@@ -0,0 +1,2 @@
+[2021-06-12 11:18:25,794 - bdfr.downloader - ERROR] - Failed to download resource https://i.redd.it/61fniokpjq471.jpg in submission nxv3dt with downloader Direct: Unrecoverable error requesting resource: HTTP Code 404
+
diff --git a/scripts/tests/example_logfiles/failed_sitedownloader_error.txt b/scripts/tests/example_logfiles/failed_sitedownloader_error.txt
new file mode 100644
index 0000000..379ddac
--- /dev/null
+++ b/scripts/tests/example_logfiles/failed_sitedownloader_error.txt
@@ -0,0 +1,2 @@
+[2021-06-12 08:38:35,657 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxr7x9: No images found in Reddit gallery
+[2021-06-12 08:47:22,005 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxpn0h: Server responded with 503 to https://www.reddit.com/gallery/nxpkvh
diff --git a/scripts/tests/example_logfiles/failed_write_error.txt b/scripts/tests/example_logfiles/failed_write_error.txt
new file mode 100644
index 0000000..2462366
--- /dev/null
+++ b/scripts/tests/example_logfiles/failed_write_error.txt
@@ -0,0 +1 @@
+[2021-06-09 22:01:04,530 - bdfr.downloader - ERROR] - Failed to write file in submission nnboza to C:\Users\Yoga 14\path\to\output\ThotNetwork\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4: [Errno 2] No such file or directory: 'C:\\Users\\Yoga 14\\path\\to\\output\\ThotNetwork\\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4'
diff --git a/scripts/tests/example_logfiles/succeed_already_exists.txt b/scripts/tests/example_logfiles/succeed_already_exists.txt
new file mode 100644
index 0000000..e5713d7
--- /dev/null
+++ b/scripts/tests/example_logfiles/succeed_already_exists.txt
@@ -0,0 +1,3 @@
+[2021-06-12 08:41:51,464 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxry0l.jpg from submission nxry0l already exists, continuing
+[2021-06-12 08:41:51,469 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrlgn.gif from submission nxrlgn already exists, continuing
+[2021-06-12 08:41:51,472 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrq9g.png from submission nxrq9g already exists, continuing
diff --git a/scripts/tests/example_logfiles/succeed_download_filter.txt b/scripts/tests/example_logfiles/succeed_download_filter.txt
new file mode 100644
index 0000000..ce4c41d
--- /dev/null
+++ b/scripts/tests/example_logfiles/succeed_download_filter.txt
@@ -0,0 +1,3 @@
+[2021-06-10 20:36:48,722 - bdfr.downloader - DEBUG] - Download filter removed nwfirr with URL https://www.youtube.com/watch?v=NVSiX0Tsees
+[2021-06-12 19:56:36,848 - bdfr.downloader - DEBUG] - Download filter removed nwfgcl with URL https://www.reddit.com/r/MaliciousCompliance/comments/nwfgcl/new_guy_decided_to_play_manager_alright/
+[2021-06-12 19:56:28,587 - bdfr.downloader - DEBUG] - Download filter removed nxuxjy with URL https://www.reddit.com/r/MaliciousCompliance/comments/nxuxjy/you_want_an_omelette_with_nothing_inside_okay/
diff --git a/scripts/tests/example_logfiles/succeed_downloaded_submission.txt b/scripts/tests/example_logfiles/succeed_downloaded_submission.txt
new file mode 100644
index 0000000..fde97fa
--- /dev/null
+++ b/scripts/tests/example_logfiles/succeed_downloaded_submission.txt
@@ -0,0 +1,7 @@
+[2021-06-12 11:58:53,864 - bdfr.downloader - INFO] - Downloaded submission nxui9y from tumblr
+[2021-06-12 11:58:56,618 - bdfr.downloader - INFO] - Downloaded submission nxsr4r from tumblr
+[2021-06-12 11:58:59,026 - bdfr.downloader - INFO] - Downloaded submission nxviir from tumblr
+[2021-06-12 11:59:00,289 - bdfr.downloader - INFO] - Downloaded submission nxusva from tumblr
+[2021-06-12 11:59:00,735 - bdfr.downloader - INFO] - Downloaded submission nxvko7 from tumblr
+[2021-06-12 11:59:01,215 - bdfr.downloader - INFO] - Downloaded submission nxvd63 from tumblr
+[2021-06-12 11:59:13,891 - bdfr.downloader - INFO] - Downloaded submission nn9cor from tumblr
diff --git a/scripts/tests/example_logfiles/succeed_hard_link.txt b/scripts/tests/example_logfiles/succeed_hard_link.txt
new file mode 100644
index 0000000..6359f6b
--- /dev/null
+++ b/scripts/tests/example_logfiles/succeed_hard_link.txt
@@ -0,0 +1 @@
+[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Hard link made linking /media/smaug/private/reddit/tumblr/nwnp2n.jpg to /media/smaug/private/reddit/tumblr/nwskqb.jpg in submission nwnp2n
diff --git a/scripts/tests/example_logfiles/succeed_resource_hash.txt b/scripts/tests/example_logfiles/succeed_resource_hash.txt
new file mode 100644
index 0000000..a089750
--- /dev/null
+++ b/scripts/tests/example_logfiles/succeed_resource_hash.txt
@@ -0,0 +1 @@
+[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Resource hash aaaaaaaaaaaaaaaaaaaaaaa from submission n86jk8 downloaded elsewhere
diff --git a/scripts/tests/test_extract_failed_ids.bats b/scripts/tests/test_extract_failed_ids.bats
index e69de29..75b9bff 100644
--- a/scripts/tests/test_extract_failed_ids.bats
+++ b/scripts/tests/test_extract_failed_ids.bats
@@ -0,0 +1,43 @@
+setup() {
+    load ./test_helper/bats-support/load
+    load ./test_helper/bats-assert/load
+}
+
+teardown() {
+    rm -f failed.txt
+}
+
+@test "fail run no logfile" {
+    run ../extract_failed_ids.sh
+    assert_failure
+}
+
+@test "fail no downloader module" {
+    run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt
+    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
+}
+
+@test "fail resource error" {
+    run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt
+    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
+}
+
+@test "fail site downloader error" {
+    run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt
+    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
+}
+
+@test "fail failed file write" {
+    run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt
+    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
+}
+
+@test "fail disabled module" {
+    run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt
+    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
+}
diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats
index e69de29..364bedb 100644
--- a/scripts/tests/test_extract_successful_ids.bats
+++ b/scripts/tests/test_extract_successful_ids.bats
@@ -0,0 +1,38 @@
+setup() {
+    load ./test_helper/bats-support/load
+    load ./test_helper/bats-assert/load
+}
+
+teardown() {
+    rm -f successful.txt
+}
+
+@test "success downloaded submission" {
+    run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt
+    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
+}
+
+@test "success resource hash" {
+    run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt
+    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
+}
+
+@test "success download filter" {
+    run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt
+    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
+}
+
+@test "success already exists" {
+    run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt
+    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
+}
+
+@test "success hard link" {
+    run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt
+    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
+    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
+}

From fc42587a8f34367cffeca324cc00d8cef0105df5 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 13 Jun 2021 13:10:13 +1000
Subject: [PATCH 065/150] Add information to sub-README

---
 scripts/tests/README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/scripts/tests/README.md b/scripts/tests/README.md
index e69de29..8349f7a 100644
--- a/scripts/tests/README.md
+++ b/scripts/tests/README.md
@@ -0,0 +1,13 @@
+# Bash Scripts Testing
+
+The `bats` framework is included and used to test the included scripts, specifically those designed to parse through the logging output. As this involves delicate regexes and indexes, it is necessary to test these.
+
+## Running Tests
+
+Running the tests is easy and can be done with a single command. Once the working directory is this directory, run the following command.
+
+```bash
+./bats/bin/bats *.bats
+```
+
+This will run all test files that have the `.bats` suffix.

From e5be624f1e2f4d985f50b53c3c72ab78e6988721 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 23 Jun 2021 14:30:39 +1000
Subject: [PATCH 066/150] Check submission URL against filter before factory

---
 bdfr/downloader.py        |  5 ++++-
 tests/test_integration.py | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index ab6bf56..f4220db 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -54,6 +54,9 @@ class RedditDownloader(RedditConnector):
         elif not isinstance(submission, praw.models.Submission):
             logger.warning(f'{submission.id} is not a submission')
             return
+        elif not self.download_filter.check_url(submission.url):
+            logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
+            return
 
         logger.debug(f'Attempting to download submission {submission.id}')
         try:
@@ -76,7 +79,7 @@ class RedditDownloader(RedditConnector):
                 logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
                 continue
             elif not self.download_filter.check_resource(res):
-                logger.debug(f'Download filter removed {submission.id} with URL {submission.url}')
+                logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                 continue
             try:
                 res.download(self.args.max_wait_time)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 0a6de3d..6bad3f6 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -189,6 +189,20 @@ def test_cli_download_download_filters(test_args: list[str], tmp_path: Path):
     assert 'Download filter removed ' in result.output
 
 
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['--subreddit', 'tumblr', '-L', '10', '--skip-domain', 'i.redd.it'],
+))
+def test_cli_download_download_filter_domain(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = create_basic_args_for_download_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'filtered due to URL' in result.output
+
+
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.slow

From ccafebf5fede757198a03cd460d887d13e15fedb Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 23 Jun 2021 14:59:26 +1000
Subject: [PATCH 067/150] Update test

---
 tests/site_downloaders/test_youtube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py
index 986b0db..afaebb7 100644
--- a/tests/site_downloaders/test_youtube.py
+++ b/tests/site_downloaders/test_youtube.py
@@ -14,7 +14,7 @@
 @pytest.mark.slow
 @pytest.mark.parametrize(('test_url', 'expected_hash'), (
     ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'),
-    ('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'),
+    ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'),
 ))
 def test_find_resources_good(test_url: str, expected_hash: str):
     test_submission = MagicMock()

From 3dacaf0872cbb61b29dd73506b74dc758c77732e Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 24 Jun 2021 16:12:41 +1000
Subject: [PATCH 068/150] Fix renamed option in test

---
 tests/test_integration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_integration.py b/tests/test_integration.py
index 6bad3f6..3ecfcd6 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -270,7 +270,7 @@ def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path):
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['--full-context', '--link', 'gxqapql'],
+    ['--comment-context', '--link', 'gxqapql'],
 ))
 def test_cli_archive_full_context(test_args: list[str], tmp_path: Path):
     runner = CliRunner()

From 31be3a916e1bd4f8d77a137eba3589d80b611f48 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 24 Jun 2021 16:14:05 +1000
Subject: [PATCH 069/150] Enable integration tests to be run concurrently

---
 tests/test_integration.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tests/test_integration.py b/tests/test_integration.py
index 3ecfcd6..0b4d36b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -2,6 +2,7 @@
 # coding=utf-8
 
 import re
+import shutil
 from pathlib import Path
 
 import pytest
@@ -12,22 +13,28 @@ from bdfr.__main__ import cli
 does_test_config_exist = Path('test_config.cfg').exists()
 
 
+def copy_test_config(tmp_path: Path):
+    shutil.copy(Path('test_config.cfg'), Path(tmp_path, 'test_config.cfg'))
+
+
 def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path):
+    copy_test_config(tmp_path)
     out = [
         'download', str(tmp_path),
         '-v',
-        '--config', 'test_config.cfg',
+        '--config', str(Path(tmp_path, 'test_config.cfg')),
         '--log', str(Path(tmp_path, 'test_log.txt')),
     ] + test_args
     return out
 
 
 def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path):
+    copy_test_config(tmp_path)
     out = [
         'archive',
         str(tmp_path),
         '-v',
-        '--config', 'test_config.cfg',
+        '--config', str(Path(tmp_path, 'test_config.cfg')),
         '--log', str(Path(tmp_path, 'test_log.txt')),
     ] + test_args
     return out

From 1d187fcf656f8237d73c289af9f45d5e95ac3ad3 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 24 Jun 2021 16:14:21 +1000
Subject: [PATCH 070/150] Consolidate tests

---
 tests/test_integration.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/tests/test_integration.py b/tests/test_integration.py
index 0b4d36b..5465c5b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -187,27 +187,14 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
 @pytest.mark.parametrize('test_args', (
     ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'],
     ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip', 'txt'],
+    ['--subreddit', 'tumblr', '-L', '10', '--skip-domain', 'i.redd.it'],
 ))
 def test_cli_download_download_filters(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
     test_args = create_basic_args_for_download_runner(test_args, tmp_path)
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
-    assert 'Download filter removed ' in result.output
-
-
-@pytest.mark.online
-@pytest.mark.reddit
-@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
-@pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'tumblr', '-L', '10', '--skip-domain', 'i.redd.it'],
-))
-def test_cli_download_download_filter_domain(test_args: list[str], tmp_path: Path):
-    runner = CliRunner()
-    test_args = create_basic_args_for_download_runner(test_args, tmp_path)
-    result = runner.invoke(cli, test_args)
-    assert result.exit_code == 0
-    assert 'filtered due to URL' in result.output
+    assert any((string in result.output for string in ('Download filter removed ', 'filtered due to URL')))
 
 
 @pytest.mark.online

From 640001a7f57d186032822c4482d5b9f0f349be5b Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 24 Jun 2021 16:37:25 +1000
Subject: [PATCH 071/150] Speed up test

---
 tests/test_integration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_integration.py b/tests/test_integration.py
index 5465c5b..19d884d 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -170,7 +170,7 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'python', '-L', 10, '--search-existing'],
+    ['--subreddit', 'python', '-L', 1, '--search-existing'],
 ))
 def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
     Path(tmp_path, 'test.txt').touch()

From 8b1a3d9abcc83e39cfdc1c58702d7686d46c5aba Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 24 Jun 2021 16:38:34 +1000
Subject: [PATCH 072/150] Enable integration tests to be run concurrently

---
 tests/test_integration.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/test_integration.py b/tests/test_integration.py
index 19d884d..5f1dfea 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -13,29 +13,29 @@ from bdfr.__main__ import cli
 does_test_config_exist = Path('test_config.cfg').exists()
 
 
-def copy_test_config(tmp_path: Path):
-    shutil.copy(Path('test_config.cfg'), Path(tmp_path, 'test_config.cfg'))
+def copy_test_config(run_path: Path):
+    shutil.copy(Path('test_config.cfg'), Path(run_path, 'test_config.cfg'))
 
 
-def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path):
-    copy_test_config(tmp_path)
+def create_basic_args_for_download_runner(test_args: list[str], run_path: Path):
+    copy_test_config(run_path)
     out = [
-        'download', str(tmp_path),
+        'download', str(run_path),
         '-v',
-        '--config', str(Path(tmp_path, 'test_config.cfg')),
-        '--log', str(Path(tmp_path, 'test_log.txt')),
+        '--config', str(Path(run_path, 'test_config.cfg')),
+        '--log', str(Path(run_path, 'test_log.txt')),
     ] + test_args
     return out
 
 
-def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path):
-    copy_test_config(tmp_path)
+def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path):
+    copy_test_config(run_path)
     out = [
         'archive',
-        str(tmp_path),
+        str(run_path),
         '-v',
-        '--config', str(Path(tmp_path, 'test_config.cfg')),
-        '--log', str(Path(tmp_path, 'test_log.txt')),
+        '--config', str(Path(run_path, 'test_config.cfg')),
+        '--log', str(Path(run_path, 'test_log.txt')),
     ] + test_args
     return out

From 1a52dfdcbcd99e10a5f4ef28869b8b3f893ac51e Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 25 Jun 2021 17:47:49 +1000
Subject: [PATCH 073/150] Add PornHub module

---
 bdfr/site_downloaders/download_factory.py     |  3 ++
 bdfr/site_downloaders/pornhub.py              | 30 +++++++++++++++++++
 .../site_downloaders/test_download_factory.py |  2 ++
 tests/site_downloaders/test_pornhub.py        | 25 ++++++++++++++++
 4 files changed, 60 insertions(+)
 create mode 100644 bdfr/site_downloaders/pornhub.py
 create mode 100644 tests/site_downloaders/test_pornhub.py

diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py
index 41813f9..911e8fb 100644
--- a/bdfr/site_downloaders/download_factory.py
+++ b/bdfr/site_downloaders/download_factory.py
@@ -13,6 +13,7 @@ from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import Youtub
 from bdfr.site_downloaders.gallery import Gallery
 from bdfr.site_downloaders.gfycat import Gfycat
 from bdfr.site_downloaders.imgur import Imgur
+from bdfr.site_downloaders.pornhub import PornHub
 from bdfr.site_downloaders.redgifs import Redgifs
 from bdfr.site_downloaders.self_post import SelfPost
 from bdfr.site_downloaders.youtube import Youtube
@@ -43,6 +44,8 @@ class DownloadFactory:
             return Youtube
         elif re.match(r'i\.redd\.it.*', sanitised_url):
             return Direct
+        elif re.match(r'pornhub\.com.*', sanitised_url):
+            return PornHub
         elif YoutubeDlFallback.can_handle_link(sanitised_url):
             return YoutubeDlFallback
         else:
diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py
new file mode 100644
index 0000000..924a6b8
--- /dev/null
+++ b/bdfr/site_downloaders/pornhub.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import logging
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+import youtube_dl
+from praw.models import Submission
+
+from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
+from bdfr.resource import Resource
+from bdfr.site_authenticator import SiteAuthenticator
+from bdfr.site_downloaders.youtube import Youtube
+
+logger = logging.getLogger(__name__)
+
+
+class PornHub(Youtube):
+    def __init__(self, post: Submission):
+        super().__init__(post)
+
+    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
+        ytdl_options = {
+            'format': 'best',
+            'nooverwrites': True,
+        }
+        out = self._download_video(ytdl_options)
+        return [out]
diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py
index 4b5356c..95b522d 100644
--- a/tests/site_downloaders/test_download_factory.py
+++ b/tests/site_downloaders/test_download_factory.py
@@ -13,6 +13,7 @@ from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import Youtub
 from bdfr.site_downloaders.gallery import Gallery
 from bdfr.site_downloaders.gfycat import Gfycat
 from bdfr.site_downloaders.imgur import Imgur
+from bdfr.site_downloaders.pornhub import PornHub
 from bdfr.site_downloaders.redgifs import Redgifs
 from bdfr.site_downloaders.self_post import SelfPost
 from bdfr.site_downloaders.youtube import Youtube
@@ -44,6 +45,7 @@ from bdfr.site_downloaders.youtube import Youtube
     ('https://streamable.com/dt46y', YoutubeDlFallback),
     ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback),
     ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback),
+    ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub),
 ))
 def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit):
     result = DownloadFactory.pull_lever(test_submission_url)
diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py
new file mode 100644
index 0000000..12144dd
--- /dev/null
+++ b/tests/site_downloaders/test_pornhub.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from bdfr.resource import Resource
+from bdfr.site_downloaders.pornhub import PornHub
+
+
+@pytest.mark.online
+@pytest.mark.slow
+@pytest.mark.parametrize(('test_url', 'expected_hash'), (
+    ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'),
+))
+def test_find_resources_good(test_url: str, expected_hash: str):
+    test_submission = MagicMock()
+    test_submission.url = test_url
+    downloader = PornHub(test_submission)
+    resources = downloader.find_resources()
+    assert len(resources) == 1
+    assert isinstance(resources[0], Resource)
+    resources[0].download(120)
+    assert resources[0].hash.hexdigest() == expected_hash

From e8998da2f00ca34cc53e899a2640904b1d5c721b Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 24 Jun 2021 20:10:31 +1000
Subject: [PATCH 074/150] Catch some Imgur errors with weird links

---
 bdfr/site_downloaders/imgur.py       |  5 +++--
 tests/site_downloaders/test_imgur.py | 14 ++++++++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py
index bd974be..44a62f1 100644
--- a/bdfr/site_downloaders/imgur.py
+++ b/bdfr/site_downloaders/imgur.py
@@ -37,9 +37,10 @@ class Imgur(BaseDownloader):
 
     @staticmethod
     def _get_data(link: str) -> dict:
-        if re.match(r'.*\.gifv$', link):
+        link = link.rstrip('?')
+        if re.match(r'(?i).*\.gifv$', link):
             link = link.replace('i.imgur', 'imgur')
-            link = re.sub('\\.gifv$', '', link)
+            link = re.sub('(?i)\\.gifv$', '', link)
 
         res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})
 
diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py
index 0e557ed..aa93795 100644
--- a/tests/site_downloaders/test_imgur.py
+++ b/tests/site_downloaders/test_imgur.py
@@ -132,10 +132,16 @@ def test_imgur_extension_validation_bad(test_extension: str):
         ),
     (
         'https://i.imgur.com/lFJai6i.gifv',
-        (
-            '01a6e79a30bec0e644e5da12365d5071',
-        ),
-    )
+        ('01a6e79a30bec0e644e5da12365d5071',),
+    ),
+    (
+        'https://i.imgur.com/ywSyILa.gifv?',
+        ('56d4afc32d2966017c38d98568709b45',),
+    ),
+    (
+        'https://imgur.com/ubYwpbk.GIFV',
+        ('d4a774aac1667783f9ed3a1bd02fac0c',),
+    ),
 ))
 def test_find_resources(test_url: str, expected_hashes: list[str]):
     mock_download = Mock()

From d53b3b7274554b6750c1fe301f00085933172a51 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Fri, 25 Jun 2021 14:00:10 +1000
Subject: [PATCH 075/150] Update gallery code to work with NSFW galleries

---
 bdfr/site_downloaders/gallery.py       | 29 ++++++------
 tests/site_downloaders/test_gallery.py | 64 ++++++++++++++------------
 2 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py
index 2c59c05..b3bae26 100644
--- a/bdfr/site_downloaders/gallery.py
+++ b/bdfr/site_downloaders/gallery.py
@@ -5,6 +5,7 @@ import re
 from typing import Optional
 
 import bs4
+import requests
 from praw.models import Submission
 
 from bdfr.exceptions import SiteDownloaderError
@@ -20,21 +21,21 @@ class Gallery(BaseDownloader):
         super().__init__(post)
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        image_urls = self._get_links(self.post.url)
+        image_urls = self._get_links(self.post.gallery_data['items'])
         if not image_urls:
             raise SiteDownloaderError('No images found in Reddit gallery')
         return [Resource(self.post, url) for url in image_urls]
 
-    @staticmethod
-    def _get_links(url: str) -> list[str]:
-        resource_headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
-                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-        }
-        page = Gallery.retrieve_url(url, headers=resource_headers)
-        soup = bs4.BeautifulSoup(page.text, 'html.parser')
-
-        links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
-        links = [link.get('href') for link in links]
-        return links
+    @ staticmethod
+    def _get_links(id_dict: list[dict]) -> list[str]:
+        out = []
+        for item in id_dict:
+            image_id = item['media_id']
+            possible_extensions = ('.jpg', '.png', '.gif', '.gifv', '.jpeg')
+            for extension in possible_extensions:
+                test_url = f'https://i.redd.it/{image_id}{extension}'
+                response = requests.head(test_url)
+                if response.status_code == 200:
+                    out.append(test_url)
+                    break
+        return out
diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py
index e903e04..857f148 100644
--- a/tests/site_downloaders/test_gallery.py
+++ b/tests/site_downloaders/test_gallery.py
@@ -8,30 +8,32 @@ from bdfr.site_downloaders.gallery import Gallery
 
 
 @pytest.mark.online
-@pytest.mark.parametrize(('test_url', 'expected'), (
-    ('https://www.reddit.com/gallery/m6lvrh', {
-        'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8',
-        'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805',
-        'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76',
-        'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444'
+@pytest.mark.parametrize(('test_ids', 'expected'), (
+    ([
+        {'media_id': '18nzv9ch0hn61'},
+        {'media_id': 'jqkizcch0hn61'},
+        {'media_id': 'k0fnqzbh0hn61'},
+        {'media_id': 'm3gamzbh0hn61'},
+    ], {
+        'https://i.redd.it/18nzv9ch0hn61.jpg',
+        'https://i.redd.it/jqkizcch0hn61.jpg',
+        'https://i.redd.it/k0fnqzbh0hn61.jpg',
+        'https://i.redd.it/m3gamzbh0hn61.jpg'
     }),
-    ('https://www.reddit.com/gallery/ljyy27', {
-        'https://preview.redd.it/04vxj25uqih61.png?width=92&'
-        'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4',
-        'https://preview.redd.it/0fnx83kpqih61.png?width=241&'
-        'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a',
-        'https://preview.redd.it/7zkmr1wqqih61.png?width=237&'
-        'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0',
-        'https://preview.redd.it/u37k5gxrqih61.png?width=443&'
-        'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862'
+    ([
+        {'media_id': '04vxj25uqih61'},
+        {'media_id': '0fnx83kpqih61'},
+        {'media_id': '7zkmr1wqqih61'},
+        {'media_id': 'u37k5gxrqih61'},
+    ], {
+        'https://i.redd.it/04vxj25uqih61.png',
+        'https://i.redd.it/0fnx83kpqih61.png',
+        'https://i.redd.it/7zkmr1wqqih61.png',
+        'https://i.redd.it/u37k5gxrqih61.png'
     }),
 ))
-def test_gallery_get_links(test_url: str, expected: set[str]):
-    results = Gallery._get_links(test_url)
+def test_gallery_get_links(test_ids: list[dict], expected: set[str]):
+    results = Gallery._get_links(test_ids)
     assert set(results) == expected
 
 
@@ -39,16 +41,20 @@ def test_gallery_get_links(test_url: str, expected: set[str]):
 @pytest.mark.reddit
 @pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), (
     ('m6lvrh', {
-        '6c8a892ae8066cbe119218bcaac731e1',
-        '93ce177f8cb7994906795f4615114d13',
-        '9a293adf19354f14582608cf22124574',
-        'b73e2c3daee02f99404644ea02f1ae65'
+        '5c42b8341dd56eebef792e86f3981c6a',
+        '8f38d76da46f4057bf2773a778e725ca',
+        'f5776f8f90491c8b770b8e0a6bfa49b3',
+        'fa1a43c94da30026ad19a9813a0ed2c2',
     }),
     ('ljyy27', {
-        '1bc38bed88f9c4770e22a37122d5c941',
-        '2539a92b78f3968a069df2dffe2279f9',
-        '37dea50281c219b905e46edeefc1a18d',
-        'ec4924cf40549728dcf53dd40bc7a73c'
+        '359c203ec81d0bc00e675f1023673238',
+        '79262fd46bce5bfa550d878a3b898be4',
+        '808c35267f44acb523ce03bfa5687404',
+        'ec8b65bdb7f1279c4b3af0ea2bbb30c3',
+    }),
+    ('nxyahw', {
+        'b89a3f41feb73ec1136ec4ffa7353eb1',
+        'cabb76fd6fd11ae6e115a2039eb09f04',
     }),
 ))
 def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit):

From 528f5c567db3964cd9a25e549d3dbcbf099df9ac Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 30 Jun 2021 11:59:38 +1000
Subject: [PATCH 076/150] Add additional test for Redgifs

---
 tests/site_downloaders/test_redgifs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py
index 71fc18e..476149f 100644
--- a/tests/site_downloaders/test_redgifs.py
+++ b/tests/site_downloaders/test_redgifs.py
@@ -31,6 +31,7 @@ def test_get_link(test_url: str, expected: str):
     ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'),
     ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'),
     ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'),
+    ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'),
 ))
 def test_download_resource(test_url: str, expected_hash: str):
     mock_submission = Mock()

From ffd07f38ba02286df3400e98267bd7765d263597 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 30 Jun 2021 12:52:27 +1000
Subject: [PATCH 077/150] Fix broken subreddit test

---
 tests/test_connector.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/test_connector.py b/tests/test_connector.py
index 2249b96..e561b96 100644
--- a/tests/test_connector.py
+++ b/tests/test_connector.py
@@ -29,6 +29,8 @@ def downloader_mock(args: Configuration):
     downloader_mock = MagicMock()
     downloader_mock.args = args
     downloader_mock.sanitise_subreddit_name = RedditConnector.sanitise_subreddit_name
+    downloader_mock.create_filtered_listing_generator = lambda x: RedditConnector.create_filtered_listing_generator(
+        downloader_mock, x)
     downloader_mock.split_args_input = RedditConnector.split_args_input
     downloader_mock.master_hash_list = {}
     return downloader_mock
@@ -37,6 +39,7 @@ def downloader_mock(args: Configuration):
 def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]) -> list:
     results = [sub for res in results for sub in res]
     assert all([isinstance(res, praw.models.Submission) for res in results])
assert not any([isinstance(m, MagicMock) for m in results]) if result_limit is not None: assert len(results) == result_limit return results @@ -167,18 +170,20 @@ def test_get_subreddit_normal( downloader_mock: MagicMock, reddit_instance: praw.Reddit, ): - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.args.limit = limit downloader_mock.args.sort = sort_type + downloader_mock.time_filter = RedditConnector.create_time_filter(downloader_mock) + downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock) + downloader_mock.determine_sort_function.return_value = RedditConnector.determine_sort_function(downloader_mock) downloader_mock.args.subreddit = test_subreddits downloader_mock.reddit_instance = reddit_instance - downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock) results = RedditConnector.get_subreddits(downloader_mock) - test_subreddits = downloader_mock._split_args_input(test_subreddits) + test_subreddits = downloader_mock.split_args_input(test_subreddits) results = [sub for res1 in results for sub in res1] assert all([isinstance(res1, praw.models.Submission) for res1 in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) assert len(results) <= max_expected_len + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online @@ -212,6 +217,7 @@ def test_get_subreddit_search( assert all([isinstance(res, praw.models.Submission) for res in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) assert len(results) <= max_expected_len + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online @@ -243,6 +249,7 @@ def test_get_multireddits_public( results = [sub for res in results for sub in res] assert all([isinstance(res, praw.models.Submission) for res in results]) assert len(results) == limit + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online @@ -268,6 +275,7 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic results = RedditConnector.get_user_data(downloader_mock) results = assert_all_results_are_submissions(limit, results) assert all([res.author.name == test_user for res in results]) + assert not any([isinstance(m, MagicMock) for m in results]) @pytest.mark.online From 469a7783b86ab0759b01276c086a282815731c25 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:00:48 +1000 Subject: [PATCH 078/150] Split integration tests --- tests/integration_tests/__init__.py | 2 + .../test_archive_integration.py | 108 +++++++++++++++ .../test_clone_integration.py | 44 +++++++ .../test_download_integration.py} | 124 +----------------- 4 files changed, 157 insertions(+), 121 deletions(-) create mode 100644 tests/integration_tests/__init__.py create mode 100644 tests/integration_tests/test_archive_integration.py create mode 100644 tests/integration_tests/test_clone_integration.py rename tests/{test_integration.py => integration_tests/test_download_integration.py} (71%) diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py new file mode 100644 index 0000000..d4c1799 --- /dev/null +++ b/tests/integration_tests/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# coding=utf-8 diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py new file mode 100644 index 0000000..8cbb2d5 --- /dev/null +++ 
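With the integration tests split into per-command modules, subsets can be selected directly on the pytest command line. A sketch of two common invocations, assuming pytest is run from the repository root and relying only on the markers already applied in these files (online, reddit, slow, authenticated):

    # Run only the archiver integration tests
    pytest tests/integration_tests/test_archive_integration.py -v

    # Run the whole integration suite, skipping the long-running cases
    pytest tests/integration_tests -m 'not slow'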
b/tests/integration_tests/test_archive_integration.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re +import shutil +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from bdfr.__main__ import cli + +does_test_config_exist = Path('../test_config.cfg').exists() + + +def copy_test_config(run_path: Path): + shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + + +def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): + copy_test_config(run_path) + out = [ + 'archive', + str(run_path), + '-v', + '--config', str(Path(run_path, '../test_config.cfg')), + '--log', str(Path(run_path, 'test_log.txt')), + ] + test_args + return out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'gstd4hk'], + ['-l', 'm2601g', '-f', 'yaml'], + ['-l', 'n60t4c', '-f', 'xml'], +)) +def test_cli_archive_single(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? format', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'Mindustry', '-L', 25], + ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'], + ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'], + ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'], + ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'], + ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'], +)) +def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], + ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'], +)) +def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--comment-context', '--link', 'gxqapql'], +)) +def test_cli_archive_full_context(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Converting comment' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.slow +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'all', '-L', 100], + ['--subreddit', 'all', '-L', 100, '--sort', 'new'], +)) +def test_cli_archive_long(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? format', result.output) diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py new file mode 100644 index 0000000..84892fc --- /dev/null +++ b/tests/integration_tests/test_clone_integration.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re +import shutil +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from bdfr.__main__ import cli + +does_test_config_exist = Path('../test_config.cfg').exists() + + +def copy_test_config(run_path: Path): + shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + + +def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): + out = [ + 'clone', + str(tmp_path), + '-v', + '--config', 'test_config.cfg', + '--log', str(Path(tmp_path, 'test_log.txt')), + ] + test_args + return out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g'], + ['-s', 'TrollXChromosomes/', '-L', 1], +)) +def test_cli_scrape_general(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_cloner_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission' in result.output + assert 'Record for entry item' in result.output diff --git a/tests/test_integration.py b/tests/integration_tests/test_download_integration.py similarity index 71% rename from tests/test_integration.py rename to tests/integration_tests/test_download_integration.py index 5f1dfea..fca0f8b 100644 --- a/tests/test_integration.py +++ 
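The clone command drives the downloader and the archiver in a single pass, which is why the test above asserts both a 'Downloaded submission' line and a 'Record for entry item' line in the same output. A usage sketch mirroring the test arguments, assuming the python3 -m bdfr entry point:

    python3 -m bdfr clone ./output -l m2601g -v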
b/tests/integration_tests/test_download_integration.py @@ -10,11 +10,11 @@ from click.testing import CliRunner from bdfr.__main__ import cli -does_test_config_exist = Path('test_config.cfg').exists() +does_test_config_exist = Path('../test_config.cfg').exists() def copy_test_config(run_path: Path): - shutil.copy(Path('test_config.cfg'), Path(run_path, 'test_config.cfg')) + shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): @@ -22,35 +22,12 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): out = [ 'download', str(run_path), '-v', - '--config', str(Path(run_path, 'test_config.cfg')), + '--config', str(Path(run_path, '../test_config.cfg')), '--log', str(Path(run_path, 'test_log.txt')), ] + test_args return out -def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): - copy_test_config(run_path) - out = [ - 'archive', - str(run_path), - '-v', - '--config', str(Path(run_path, 'test_config.cfg')), - '--log', str(Path(run_path, 'test_log.txt')), - ] + test_args - return out - - -def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): - out = [ - 'clone', - str(tmp_path), - '-v', - '--config', 'test_config.cfg', - '--log', str(Path(tmp_path, 'test_log.txt')), - ] + test_args - return out - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @@ -211,85 +188,6 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['-l', 'gstd4hk'], - ['-l', 'm2601g', '-f', 'yaml'], - ['-l', 'n60t4c', '-f', 'xml'], -)) -def test_cli_archive_single(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert re.search(r'Writing entry .*? to file in .*? format', result.output) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--subreddit', 'Mindustry', '-L', 25], - ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'], - ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'], - ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'], - ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'], - ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'], -)) -def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], - ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'], -)) -def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--comment-context', '--link', 'gxqapql'], -)) -def test_cli_archive_full_context(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert 'Converting comment' in result.output - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.slow -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['--subreddit', 'all', '-L', 100], - ['--subreddit', 'all', '-L', 100, '--sort', 'new'], -)) -def test_cli_archive_long(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_archive_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert re.search(r'Writing entry .*? to file in .*? format', result.output) - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow @@ -393,19 +291,3 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'skipped due to disabled module' in result.output assert 'Downloaded submission' not in result.output - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') -@pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g'], - ['-s', 'TrollXChromosomes/', '-L', 1], -)) -def test_cli_scrape_general(test_args: list[str], tmp_path: Path): - runner = CliRunner() - test_args = create_basic_args_for_cloner_runner(test_args, tmp_path) - result = runner.invoke(cli, test_args) - assert result.exit_code == 0 - assert 'Downloaded submission' in result.output - assert 'Record for entry item' in result.output From edfeb653a4780f4dade685d8bba2ac4e451bc450 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 1 Jul 2021 13:11:31 +1000 Subject: [PATCH 079/150] Record user flair in comment archive entries --- bdfr/archive_entry/base_archive_entry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bdfr/archive_entry/base_archive_entry.py b/bdfr/archive_entry/base_archive_entry.py index 7b84fbe..516e5d0 100644 --- a/bdfr/archive_entry/base_archive_entry.py +++ b/bdfr/archive_entry/base_archive_entry.py @@ -22,6 +22,7 @@ class BaseArchiveEntry(ABC): 'id': in_comment.id, 'score': in_comment.score, 'subreddit': in_comment.subreddit.display_name, + 'author_flair': in_comment.author_flair_text, 'submission': in_comment.submission.id, 'stickied': in_comment.stickied, 'body': in_comment.body, From bd34c37052f7b0bf2e678051fb464a49dbd0adcc Mon Sep 17 00:00:00 2001 From: 
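A note on 'Record user flair in comment archive entries' above: with author_flair added, each archived comment dictionary carries the flair text next to the existing fields, and praw's author_flair_text is None (null in JSON) for unflaired users. A sketch of the resulting record, with all values invented for illustration:

    {
        "id": "h0aaaaa",
        "score": 42,
        "subreddit": "Mindustry",
        "author_flair": "v6 veteran",
        "submission": "m2601g",
        "stickied": false,
        "body": "comment text"
    }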
Serene-Arc Date: Wed, 30 Jun 2021 12:21:17 +1000 Subject: [PATCH 080/150] Add exception for special friends subreddit --- bdfr/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 68efc0c..8a6f0bf 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -394,7 +394,7 @@ class RedditConnector(metaclass=ABCMeta): @staticmethod def check_subreddit_status(subreddit: praw.models.Subreddit): - if subreddit.display_name == 'all': + if subreddit.display_name in ('all', 'friends'): return try: assert subreddit.id From c4aa6177372e73a469e58cdc5de57675de69a7a8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:17:13 +1000 Subject: [PATCH 081/150] Add test for friends subreddit --- .../test_download_integration.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index fca0f8b..56da1d5 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -58,6 +58,21 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): assert 'Added submissions from subreddit ' in result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'friends', '-L', 10, '--authenticate'], +)) +def test_cli_download_user_specific_subreddits(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Added submissions from subreddit ' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') From 8db9d0bcc4119d22b3fcb4e6810d737bf981957c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:29:39 +1000 Subject: [PATCH 082/150] Add test for unauthenticated instances --- bdfr/connector.py | 3 +++ tests/integration_tests/test_download_integration.py | 1 + 2 files changed, 4 insertions(+) diff --git a/bdfr/connector.py b/bdfr/connector.py index 8a6f0bf..a4165fc 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -242,6 +242,9 @@ class RedditConnector(metaclass=ABCMeta): if self.args.subreddit: out = [] for reddit in self.split_args_input(self.args.subreddit): + if reddit == 'friends' and self.authenticated is False: + logger.error('Cannot read friends subreddit without an authenticated instance') + continue try: reddit = self.reddit_instance.subreddit(reddit) try: diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 56da1d5..4ee0bba 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -214,6 +214,7 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): ['--subreddit', 'submitters', '-L', 10], # Private subreddit ['--subreddit', 'donaldtrump', '-L', 10], # Banned subreddit ['--user', 'djnish', '--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10], + ['--subreddit', 'friends', '-L', 10], )) def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): runner = CliRunner() From 
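The three patches above work together: 'friends' now skips the subreddit status check, but reading it requires an authenticated Reddit instance, and the connector degrades to a logged error instead of crashing when that is missing. A usage sketch, assuming the python3 -m bdfr entry point:

    # Succeeds: the friends listing is readable with OAuth2 authentication
    python3 -m bdfr download ./output --subreddit friends -L 10 --authenticate

    # Soft-fails: logs 'Cannot read friends subreddit without an
    # authenticated instance' and carries on with any other sources
    python3 -m bdfr download ./output --subreddit friends -L 10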
1319eeb6dafcaf4327e65dc9c8faf033c2e0aaf0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:53:02 +1000 Subject: [PATCH 083/150] Fix error with crossposted Reddit galleries --- bdfr/site_downloaders/gallery.py | 11 ++++++++++- tests/site_downloaders/test_gallery.py | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index b3bae26..1070419 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -21,7 +21,16 @@ class Gallery(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - image_urls = self._get_links(self.post.gallery_data['items']) + try: + image_urls = self._get_links(self.post.gallery_data['items']) + except AttributeError: + try: + image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) + except (AttributeError, IndexError): + logger.error(f'Could not find gallery data in submission {self.post.id}') + logger.exception('Gallery image find failure') + raise SiteDownloaderError('No images found in Reddit gallery') + if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') return [Resource(self.post, url) for url in image_urls] diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 857f148..51045f8 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -56,6 +56,10 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]): 'b89a3f41feb73ec1136ec4ffa7353eb1', 'cabb76fd6fd11ae6e115a2039eb09f04', }), + ('obkflw', { + '65163f685fb28c5b776e0e77122718be', + '2a337eb5b13c34d3ca3f51b5db7c13e9', + }), )) def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From 390ce57f461db854d3ff062ded10bdc5fe8d52e3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:57:10 +1000 Subject: [PATCH 084/150] Remove redundant parenthesis --- tests/site_downloaders/test_youtube.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index afaebb7..f3a97e1 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -28,8 +28,9 @@ def test_find_resources_good(test_url: str, expected_hash: str): @pytest.mark.online -@pytest.mark.parametrize(('test_url'), ( - ('https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman-interview-oj-simpson-goliath-chronicles'), +@pytest.mark.parametrize('test_url', ( + 'https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman' + '-interview-oj-simpson-goliath-chronicles', )) def test_find_resources_bad(test_url: str): test_submission = MagicMock() From 6efcf1ce7e310a4475e68a83063f897324a93e5b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:58:20 +1000 Subject: [PATCH 085/150] Remove unused imports --- bdfr/site_downloaders/gallery.py | 2 -- bdfr/site_downloaders/pornhub.py | 4 ---- bdfr/site_downloaders/redgifs.py | 1 - tests/integration_tests/test_clone_integration.py | 1 - tests/integration_tests/test_download_integration.py | 1 - tests/test_downloader.py | 1 - 6 files changed, 10 deletions(-) diff --git 
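Most of these leftovers date from earlier rewrites in this series: once Gallery._get_links stopped scraping preview.redd.it pages, bs4 and re had no users left in gallery.py. A linter pass finds this class of issue mechanically, for example (assuming flake8 is available; F401 flags unused imports and F841 unused locals, matching this patch and the next):

    flake8 --select=F401,F841 bdfr/ tests/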
a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 1070419..df161e5 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 import logging -import re from typing import Optional -import bs4 import requests from praw.models import Submission diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py index 924a6b8..6658d7e 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -2,14 +2,10 @@ # coding=utf-8 import logging -import tempfile -from pathlib import Path from typing import Optional -import youtube_dl from praw.models import Submission -from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.youtube import Youtube diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 051bc12..9cfec02 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -4,7 +4,6 @@ import json import re from typing import Optional -from bs4 import BeautifulSoup from praw.models import Submission from bdfr.exceptions import SiteDownloaderError diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index 84892fc..343b2d3 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 -import re import shutil from pathlib import Path diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 4ee0bba..305fe99 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 -import re import shutil from pathlib import Path diff --git a/tests/test_downloader.py b/tests/test_downloader.py index d67aee6..e5f0a31 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,7 +9,6 @@ from unittest.mock import MagicMock, patch import praw.models import pytest -import bdfr.site_downloaders.download_factory from bdfr.__main__ import setup_logging from bdfr.configuration import Configuration from bdfr.connector import RedditConnector From aa55a92791df149023af971670f2f4e9196cd13d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 14:58:56 +1000 Subject: [PATCH 086/150] Remove unused local variables --- bdfr/connector.py | 2 +- bdfr/site_downloaders/youtube.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index a4165fc..5628e94 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -210,7 +210,7 @@ class RedditConnector(metaclass=ABCMeta): if log_path.exists(): try: file_handler.doRollover() - except PermissionError as e: + except PermissionError: logger.critical( 'Cannot rollover logfile, make sure this is the only ' 'BDFR process or specify alternate logfile location') diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index e12fdc1..8b93b23 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -43,7 +43,6 @@ class Youtube(BaseDownloader): except youtube_dl.DownloadError as e: raise SiteDownloaderError(f'Youtube download failed: {e}') - downloaded_file = None downloaded_files = 
list(download_path.iterdir()) if len(downloaded_files) > 0: downloaded_file = downloaded_files[0] From d5ef991b3abe093e8d2a8fcfd6e8251e0c5702f2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 2 Jul 2021 15:11:09 +1000 Subject: [PATCH 087/150] Catch additional error in galleries --- bdfr/site_downloaders/gallery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index df161e5..62fec60 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -24,7 +24,7 @@ class Gallery(BaseDownloader): except AttributeError: try: image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) - except (AttributeError, IndexError): + except (AttributeError, IndexError, TypeError): logger.error(f'Could not find gallery data in submission {self.post.id}') logger.exception('Gallery image find failure') raise SiteDownloaderError('No images found in Reddit gallery') From 7f1c929a080b2462897c5f998ba4fac6b9a60fa4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 3 Jul 2021 13:54:26 +1000 Subject: [PATCH 088/150] Add fallback scope --- bdfr/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 5628e94..1eb91c8 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -119,7 +119,7 @@ class RedditConnector(metaclass=ABCMeta): logger.debug('Using authenticated Reddit instance') if not self.cfg_parser.has_option('DEFAULT', 'user_token'): logger.log(9, 'Commencing OAuth2 authentication') - scopes = self.cfg_parser.get('DEFAULT', 'scopes') + scopes = self.cfg_parser.get('DEFAULT', 'scopes', fallback='identity, history, read, save') scopes = OAuth2Authenticator.split_scopes(scopes) oauth2_authenticator = OAuth2Authenticator( scopes, From d03a5e556e4699c84ed0f72994c78677a7020944 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Jul 2021 10:59:35 +1000 Subject: [PATCH 089/150] Stop writing new value to config --- bdfr/connector.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 1eb91c8..d6d43dd 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -90,10 +90,7 @@ class RedditConnector(metaclass=ABCMeta): def read_config(self): """Read any cfg values that need to be processed""" if self.args.max_wait_time is None: - if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'): - self.cfg_parser.set('DEFAULT', 'max_wait_time', '120') - logger.log(9, 'Wrote default download wait time download to config file') - self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time') + self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120) logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') if self.args.time_format is None: option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') From 2f8ca766c604ff69227bdc69d5c67fba38d01e3d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Jul 2021 11:00:02 +1000 Subject: [PATCH 090/150] Update regex --- bdfr/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index d6d43dd..0e78c8c 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -94,7 +94,7 @@ class RedditConnector(metaclass=ABCMeta): logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') if self.args.time_format is None: option = 
self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO') - if re.match(r'^[ \'\"]*$', option): + if re.match(r'^[\s\'\"]*$', option): option = 'ISO' logger.debug(f'Setting datetime format string to {option}') self.args.time_format = option From 381e3c29fad3de00f6e5be58539b78c29125becf Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 18 Jul 2021 14:42:10 +1000 Subject: [PATCH 091/150] Fix test where comments in saved list --- tests/test_connector.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index e561b96..15eede1 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -45,6 +45,15 @@ def assert_all_results_are_submissions(result_limit: int, results: list[Iterator return results +def assert_all_results_are_submissions_or_comments(result_limit: int, results: list[Iterator]) -> list: + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) or isinstance(res, praw.models.Comment) for res in results]) + assert not any([isinstance(m, MagicMock) for m in results]) + if result_limit is not None: + assert len(results) == result_limit + return results + + def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): downloader_mock.args.directory = tmp_path / 'test' downloader_mock.config_directories.user_config_dir = tmp_path @@ -297,7 +306,7 @@ def test_get_user_authenticated_lists( downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.user = [RedditConnector.resolve_user_name(downloader_mock, 'me')] results = RedditConnector.get_user_data(downloader_mock) - assert_all_results_are_submissions(10, results) + assert_all_results_are_submissions_or_comments(10, results) @pytest.mark.parametrize(('test_name', 'expected'), ( From 8826fc5aa9838101a668e2054a70c153bf8d0301 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 18 Jul 2021 14:42:20 +1000 Subject: [PATCH 092/150] Fix outdated test --- tests/site_downloaders/test_imgur.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index aa93795..94bd240 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -65,11 +65,11 @@ def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_d {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True} ), ( - 'https://imgur.com/BuzvZwb.gifv', + 'https://imgur.com/65FqTpT.gifv', { - 'hash': 'BuzvZwb', + 'hash': '65FqTpT', 'title': '', - 'description': 'Akron Glass Works', + 'description': '', 'animated': True, 'mimetype': 'video/mp4' }, From d6e45de09bf6b9b6f007bd66d10d003836ccdc27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sun, 18 Jul 2021 10:12:55 +0300 Subject: [PATCH 093/150] Version 2.3 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2969fe0..aa847a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.2.0 +version = 2.3.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From 77aaee96f3dbfbbd042948d285ebca7ff1055424 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 19 Jul 2021 18:44:54 +1000 Subject: [PATCH 094/150] Fix bug with deleted 
galleries --- bdfr/site_downloaders/gallery.py | 2 +- tests/site_downloaders/test_gallery.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 62fec60..cd34416 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -21,7 +21,7 @@ class Gallery(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: try: image_urls = self._get_links(self.post.gallery_data['items']) - except AttributeError: + except (AttributeError, TypeError): try: image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) except (AttributeError, IndexError, TypeError): diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 51045f8..f84650d 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -4,6 +4,7 @@ import praw import pytest +from bdfr.exceptions import SiteDownloaderError from bdfr.site_downloaders.gallery import Gallery @@ -68,3 +69,13 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re [res.download(120) for res in results] hashes = [res.hash.hexdigest() for res in results] assert set(hashes) == expected_hashes + + +@pytest.mark.parametrize('test_id', ( + 'n0pyzp', +)) +def test_gallery_download_raises_right_error(test_id: str, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_id) + gallery = Gallery(test_submission) + with pytest.raises(SiteDownloaderError): + gallery.find_resources() From 1a4ff07f78f51dfbbe70c089b440e1c0f169be08 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Jul 2021 16:58:33 +1000 Subject: [PATCH 095/150] Add ability to read IDs from files --- bdfr/__main__.py | 11 ++++++----- bdfr/configuration.py | 1 + bdfr/connector.py | 16 +++++++++++----- .../test_download_integration.py | 14 ++++++++++++++ tests/test_connector.py | 5 ++--- 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 67e4f99..367f8c6 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -6,9 +6,9 @@ import sys import click from bdfr.archiver import Archiver +from bdfr.cloner import RedditCloner from bdfr.configuration import Configuration from bdfr.downloader import RedditDownloader -from bdfr.cloner import RedditCloner logger = logging.getLogger() @@ -17,6 +17,7 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--include-id-file', multiple=True, default=None), click.option('--log', type=str, default=None), click.option('--saved', is_flag=True, default=None), click.option('--search', default=None, type=str), @@ -26,12 +27,12 @@ _common_options = [ click.option('-L', '--limit', default=None, type=int), click.option('-l', '--link', multiple=True, default=None, type=str), click.option('-m', '--multireddit', multiple=True, default=None, type=str), + click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')), + default=None), click.option('-s', '--subreddit', multiple=True, default=None, type=str), - click.option('-v', '--verbose', default=None, count=True), - click.option('-u', '--user', type=str, multiple=True, default=None), click.option('-t', '--time', type=click.Choice(('all', 
'hour', 'day', 'week', 'month', 'year')), default=None), - click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', - 'controversial', 'rising', 'relevance')), default=None), + click.option('-u', '--user', type=str, multiple=True, default=None), + click.option('-v', '--verbose', default=None, count=True), ] _downloader_options = [ diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 36a1860..bc4c541 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -18,6 +18,7 @@ class Configuration(Namespace): self.exclude_id_file = [] self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}' self.folder_scheme: str = '{SUBREDDIT}' + self.include_id_file = [] self.limit: Optional[int] = None self.link: list[str] = [] self.log: Optional[str] = None diff --git a/bdfr/connector.py b/bdfr/connector.py index 0e78c8c..a379847 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -3,6 +3,7 @@ import configparser import importlib.resources +import itertools import logging import logging.handlers import re @@ -78,7 +79,12 @@ class RedditConnector(metaclass=ABCMeta): self.create_reddit_instance() self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user])) - self.excluded_submission_ids = self.read_excluded_ids() + self.excluded_submission_ids = set.union( + self.read_id_files(self.args.exclude_id_file), + set(self.args.exclude_id), + ) + + self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file))) self.master_hash_list = {} self.authenticator = self.create_authenticator() @@ -403,13 +409,13 @@ class RedditConnector(metaclass=ABCMeta): except prawcore.Forbidden: raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped') - def read_excluded_ids(self) -> set[str]: + @staticmethod + def read_id_files(file_locations: list[str]) -> set[str]: out = [] - out.extend(self.args.exclude_id) - for id_file in self.args.exclude_id_file: + for id_file in file_locations: id_file = Path(id_file).resolve().expanduser() if not id_file.exists(): - logger.warning(f'ID exclusion file at {id_file} does not exist') + logger.warning(f'ID file at {id_file} does not exist') continue with open(id_file, 'r') as file: for line in file: diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 305fe99..cb4a273 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -306,3 +306,17 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'skipped due to disabled module' in result.output assert 'Downloaded submission' not in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +def test_cli_download_include_id_file(tmp_path: Path): + test_file = Path(tmp_path, 'include.txt') + test_args = ['--include-id-file', str(test_file)] + test_file.write_text('odr9wg\nody576') + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission' in result.output diff --git a/tests/test_connector.py b/tests/test_connector.py index 15eede1..2dd76f9 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -339,11 +339,10 @@ def 
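The new --include-id-file option mirrors the existing exclusion file: plain text, one submission ID per line, and read_id_files collapses the IDs from every listed file into a single set before they are chained onto args.link. A usage sketch with the IDs from the integration test above:

    # include.txt
    odr9wg
    ody576

    python3 -m bdfr download ./output --include-id-file ./include.txt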
test_split_subreddit_entries(test_subreddit_entries: list[str], expected: se assert results == expected -def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path): +def test_read_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path): test_file = tmp_path / 'test.txt' test_file.write_text('aaaaaa\nbbbbbb') - downloader_mock.args.exclude_id_file = [test_file] - results = RedditConnector.read_excluded_ids(downloader_mock) + results = RedditConnector.read_id_files([str(test_file)]) assert results == {'aaaaaa', 'bbbbbb'} From 7a1663db51895e849aa112ec51b6b2c8a4301da7 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Jul 2021 17:02:19 +1000 Subject: [PATCH 096/150] Update README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index be4f455..89a4e90 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,10 @@ The following options are common between both the `archive` and `download` comma - Can be specified multiple times - Disables certain modules from being used - See [Disabling Modules](#disabling-modules) for more information and a list of module names +- `--include-id-file` + - This will add any submission with the IDs in the files provided + - Can be specified multiple times + - Format is one ID per line - `--log` - This allows one to specify the location of the logfile - This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below From 44453b1707abc6559b3d9bc05d4ad53c8ffc7fbe Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Jul 2021 13:12:50 +1000 Subject: [PATCH 097/150] Update tests --- tests/site_downloaders/test_gallery.py | 5 +---- tests/site_downloaders/test_gfycat.py | 2 -- tests/site_downloaders/test_redgifs.py | 9 +++------ 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index f84650d..4e5d9f1 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -53,10 +53,6 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]): '808c35267f44acb523ce03bfa5687404', 'ec8b65bdb7f1279c4b3af0ea2bbb30c3', }), - ('nxyahw', { - 'b89a3f41feb73ec1136ec4ffa7353eb1', - 'cabb76fd6fd11ae6e115a2039eb09f04', - }), ('obkflw', { '65163f685fb28c5b776e0e77122718be', '2a337eb5b13c34d3ca3f51b5db7c13e9', @@ -73,6 +69,7 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re @pytest.mark.parametrize('test_id', ( 'n0pyzp', + 'nxyahw', )) def test_gallery_download_raises_right_error(test_id: str, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_id) diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 56aa2d0..3a405f8 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -13,8 +13,6 @@ from bdfr.site_downloaders.gfycat import Gfycat @pytest.mark.parametrize(('test_url', 'expected_url'), ( ('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'), ('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'), - ('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'), - ('https://gfycat.com/CornyLoathsomeHarrierhawk', 'https://thumbs2.redgifs.com/CornyLoathsomeHarrierhawk.mp4') )) def test_get_link(test_url: str, expected_url: str): result = 
Gfycat._get_link(test_url) diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 476149f..097fbf4 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -15,10 +15,8 @@ from bdfr.site_downloaders.redgifs import Redgifs 'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'), ('https://redgifs.com/watch/springgreendecisivetaruca', 'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'), - ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', - 'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'), - ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', - 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), + ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', + 'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'), )) def test_get_link(test_url: str, expected: str): result = Redgifs._get_link(test_url) @@ -29,9 +27,8 @@ def test_get_link(test_url: str, expected: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'), ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), - ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'), - ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'), ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'), + ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', '46d5aa77fe80c6407de1ecc92801c10e'), )) def test_download_resource(test_url: str, expected_hash: str): mock_submission = Mock() From 3cdae99490e54bc6eb0da452cce2b3048da10786 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Jul 2021 13:39:49 +1000 Subject: [PATCH 098/150] Implement callbacks for downloading --- bdfr/archiver.py | 6 +- bdfr/downloader.py | 2 +- bdfr/resource.py | 50 +++++++------- bdfr/site_downloaders/direct.py | 2 +- bdfr/site_downloaders/erome.py | 2 +- .../youtubedl_fallback.py | 26 ++++---- bdfr/site_downloaders/gallery.py | 2 +- bdfr/site_downloaders/imgur.py | 2 +- bdfr/site_downloaders/pornhub.py | 7 +- bdfr/site_downloaders/redgifs.py | 2 +- bdfr/site_downloaders/self_post.py | 2 +- bdfr/site_downloaders/youtube.py | 65 ++++++++++++------- tests/site_downloaders/test_direct.py | 2 +- tests/site_downloaders/test_erome.py | 2 +- tests/site_downloaders/test_gallery.py | 2 +- tests/site_downloaders/test_gfycat.py | 2 +- tests/site_downloaders/test_imgur.py | 2 +- tests/site_downloaders/test_pornhub.py | 2 +- tests/site_downloaders/test_redgifs.py | 2 +- tests/site_downloaders/test_youtube.py | 2 +- tests/test_download_filter.py | 4 +- tests/test_file_name_formatter.py | 10 +-- tests/test_resource.py | 6 +- 23 files changed, 112 insertions(+), 92 deletions(-) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 74b92e8..d445e8d 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -76,17 +76,17 @@ class Archiver(RedditConnector): logger.info(f'Record for entry item {praw_item.id} written to disk') def _write_entry_json(self, entry: BaseArchiveEntry): - resource = Resource(entry.source, '', '.json') + resource = Resource(entry.source, '', lambda: None, '.json') content = json.dumps(entry.compile()) self._write_content_to_disk(resource, content) def _write_entry_xml(self, entry: BaseArchiveEntry): - resource = Resource(entry.source, '', 
'.xml')
+        resource = Resource(entry.source, '', lambda: None, '.xml')
         content = dict2xml.dict2xml(entry.compile(), wrap='root')
         self._write_content_to_disk(resource, content)
 
     def _write_entry_yaml(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.yaml')
+        resource = Resource(entry.source, '', lambda: None, '.yaml')
         content = yaml.dump(entry.compile())
         self._write_content_to_disk(resource, content)
 
diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index f4220db..69aa818 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector):
                     logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                     continue
                 try:
-                    res.download(self.args.max_wait_time)
+                    res.download()
                 except errors.BulkDownloaderException as e:
                     logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                                  f'with downloader {downloader_class.__name__}: {e}')
diff --git a/bdfr/resource.py b/bdfr/resource.py
index e8f9fd1..8f874ef 100644
--- a/bdfr/resource.py
+++ b/bdfr/resource.py
@@ -6,7 +6,7 @@ import logging
 import re
 import time
 import urllib.parse
-from typing import Optional
+from typing import Callable, Optional
 
 import _hashlib
 import requests
@@ -18,40 +18,44 @@ logger = logging.getLogger(__name__)
 
 
 class Resource:
-    def __init__(self, source_submission: Submission, url: str, extension: str = None):
+    def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
         self.source_submission = source_submission
         self.content: Optional[bytes] = None
         self.url = url
         self.hash: Optional[_hashlib.HASH] = None
         self.extension = extension
+        self.download_function = download_function
         if not self.extension:
             self.extension = self._determine_extension()
 
     @staticmethod
-    def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
-        try:
-            response = requests.get(url)
-            if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
-                return response.content
-            elif response.status_code in (408, 429):
-                raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
-            else:
-                raise BulkDownloaderException(
-                    f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
-        except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
-            logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
-            time.sleep(current_wait_time)
-            if current_wait_time < max_wait_time:
-                current_wait_time += 60
-                return Resource.retry_download(url, max_wait_time, current_wait_time)
-            else:
-                logger.error(f'Max wait time exceeded for resource at url {url}')
-                raise
+    def retry_download(url: str, max_wait_time: int) -> Callable:
+        def http_download() -> Optional[bytes]:
+            current_wait_time = 60
+            while True:
+                try:
+                    response = requests.get(url)
+                    if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
+                        return response.content
+                    elif response.status_code in (408, 429):
+                        raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
+                    else:
+                        raise BulkDownloaderException(
+                            f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
+                except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
+                    logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
+                    time.sleep(current_wait_time)
+                    if current_wait_time < 
max_wait_time: + current_wait_time += 60 + else: + logger.error(f'Max wait time exceeded for resource at url {url}') + raise + return http_download - def download(self, max_wait_time: int): + def download(self): if not self.content: try: - content = self.retry_download(self.url, max_wait_time) + content = self.download_function() except requests.exceptions.ConnectionError as e: raise BulkDownloaderException(f'Could not download resource: {e}') except BulkDownloaderException: diff --git a/bdfr/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py index 106f251..df1a469 100644 --- a/bdfr/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -14,4 +14,4 @@ class Direct(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return [Resource(self.post, self.post.url)] + return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))] diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index bd29ea4..69b9ae3 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -29,7 +29,7 @@ class Erome(BaseDownloader): for link in links: if not re.match(r'https?://.*', link): link = 'https://' + link - out.append(Resource(self.post, link)) + out.append(Resource(self.post, link, Resource.retry_download(link, 300))) return out @staticmethod diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 281182a..6ede405 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -4,7 +4,6 @@ import logging from typing import Optional -import youtube_dl from praw.models import Submission from bdfr.resource import Resource @@ -20,21 +19,18 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): super(YoutubeDlFallback, self).__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = super()._download_video({}) + out = Resource( + self.post, + self.post.url, + super()._download_video({}), + super().get_video_attributes(self.post.url)['ext'], + ) return [out] @staticmethod def can_handle_link(url: str) -> bool: - yt_logger = logging.getLogger('youtube-dl') - yt_logger.setLevel(logging.CRITICAL) - with youtube_dl.YoutubeDL({ - 'logger': yt_logger, - }) as ydl: - try: - result = ydl.extract_info(url, download=False) - if result: - return True - except Exception as e: - logger.exception(e) - return False - return False + attributes = YoutubeDlFallback.get_video_attributes(url) + if attributes: + return True + else: + return False diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index cd34416..c016d28 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -31,7 +31,7 @@ class Gallery(BaseDownloader): if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') - return [Resource(self.post, url) for url in image_urls] + return [Resource(self.post, url, Resource.retry_download(url, 300)) for url in image_urls] @ staticmethod def _get_links(id_dict: list[dict]) -> list[str]: diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 44a62f1..79a1115 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -33,7 +33,7 @@ class Imgur(BaseDownloader): def 
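The pattern in these site downloader hunks is uniform: instead of fetching bytes eagerly, each downloader hands Resource a zero-argument callable that produces the bytes on demand, with Resource.retry_download(url, max_wait_time) as the stock HTTP implementation. A minimal sketch of a downloader written against the new signature (the class name and the 300-second ceiling are illustrative, matching the values used above rather than any fixed API requirement):

    from typing import Optional

    from bdfr.resource import Resource
    from bdfr.site_authenticator import SiteAuthenticator
    from bdfr.site_downloaders.base_downloader import BaseDownloader


    class ExampleDirectDownloader(BaseDownloader):
        def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
            # No network traffic happens here; the callable only runs
            # when Resource.download() is invoked later
            return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))]

One consequence of baking the wait time into the callable is that args.max_wait_time no longer reaches the HTTP retry loop, which is why downloader.py now calls res.download() with no argument.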
_compute_image_url(self, image: dict) -> Resource: image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) - return Resource(self.post, image_url) + return Resource(self.post, image_url, Resource.retry_download(image_url, 300)) @staticmethod def _get_data(link: str) -> dict: diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py index 6658d7e..c2bc0ad 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -22,5 +22,10 @@ class PornHub(Youtube): 'format': 'best', 'nooverwrites': True, } - out = self._download_video(ytdl_options) + out = Resource( + self.post, + self.post.url, + super()._download_video(ytdl_options), + super().get_video_attributes(self.post.url)['ext'], + ) return [out] diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 9cfec02..d4989e7 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -18,7 +18,7 @@ class Redgifs(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: media_url = self._get_link(self.post.url) - return [Resource(self.post, media_url, '.mp4')] + return [Resource(self.post, media_url, Resource.retry_download(media_url, 300), '.mp4')] @staticmethod def _get_link(url: str) -> str: diff --git a/bdfr/site_downloaders/self_post.py b/bdfr/site_downloaders/self_post.py index cb922ee..6e4ce0e 100644 --- a/bdfr/site_downloaders/self_post.py +++ b/bdfr/site_downloaders/self_post.py @@ -17,7 +17,7 @@ class SelfPost(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = Resource(self.post, self.post.url, '.txt') + out = Resource(self.post, self.post.url, lambda: None, '.txt') out.content = self.export_to_string().encode('utf-8') out.create_hash() return [out] diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index 8b93b23..126cb6a 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -3,12 +3,12 @@ import logging import tempfile from pathlib import Path -from typing import Optional +from typing import Callable, Optional import youtube_dl from praw.models import Submission -from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError) +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.base_downloader import BaseDownloader @@ -26,32 +26,47 @@ class Youtube(BaseDownloader): 'playlistend': 1, 'nooverwrites': True, } - out = self._download_video(ytdl_options) - return [out] + download_function = self._download_video(ytdl_options) + try: + extension = self.get_video_attributes(self.post.url)['ext'] + except KeyError: + raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}') + res = Resource(self.post, self.post.url, download_function, extension) + return [res] - def _download_video(self, ytdl_options: dict) -> Resource: + def _download_video(self, ytdl_options: dict) -> Callable: yt_logger = logging.getLogger('youtube-dl') yt_logger.setLevel(logging.CRITICAL) ytdl_options['quiet'] = True ytdl_options['logger'] = yt_logger - with tempfile.TemporaryDirectory() as temp_dir: - download_path = Path(temp_dir).resolve() - ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' - try: - with 
youtube_dl.YoutubeDL(ytdl_options) as ydl: - ydl.download([self.post.url]) - except youtube_dl.DownloadError as e: - raise SiteDownloaderError(f'Youtube download failed: {e}') - downloaded_files = list(download_path.iterdir()) - if len(downloaded_files) > 0: - downloaded_file = downloaded_files[0] - else: - raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}") - extension = downloaded_file.suffix - with open(downloaded_file, 'rb') as file: - content = file.read() - out = Resource(self.post, self.post.url, extension) - out.content = content - out.create_hash() - return out + def download() -> bytes: + with tempfile.TemporaryDirectory() as temp_dir: + download_path = Path(temp_dir).resolve() + ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' + try: + with youtube_dl.YoutubeDL(ytdl_options) as ydl: + ydl.download([self.post.url]) + except youtube_dl.DownloadError as e: + raise SiteDownloaderError(f'Youtube download failed: {e}') + + downloaded_files = list(download_path.iterdir()) + if len(downloaded_files) > 0: + downloaded_file = downloaded_files[0] + else: + raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}") + with open(downloaded_file, 'rb') as file: + content = file.read() + return content + return download + + @staticmethod + def get_video_attributes(url: str) -> dict: + yt_logger = logging.getLogger('youtube-dl') + yt_logger.setLevel(logging.CRITICAL) + with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl: + try: + result = ydl.extract_info(url, download=False) + return result + except Exception as e: + logger.exception(e) diff --git a/tests/site_downloaders/test_direct.py b/tests/site_downloaders/test_direct.py index 790f4c3..56f90fc 100644 --- a/tests/site_downloaders/test_direct.py +++ b/tests/site_downloaders/test_direct.py @@ -21,5 +21,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 84546c4..2918bef 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -49,6 +49,6 @@ def test_download_resource(test_url: str, expected_hashes: tuple[str]): mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() - [res.download(120) for res in resources] + [res.download() for res in resources] resource_hashes = [res.hash.hexdigest() for res in resources] assert len(resource_hashes) == len(expected_hashes) diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 4e5d9f1..08eea91 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -62,7 +62,7 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re test_submission = reddit_instance.submission(id=test_submission_id) gallery = Gallery(test_submission) results = gallery.find_resources() - [res.download(120) for res in results] + [res.download() for res in results] hashes = [res.hash.hexdigest() for res in results] assert set(hashes) == expected_hashes diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 3a405f8..981d01d 100644 --- a/tests/site_downloaders/test_gfycat.py +++ 
b/tests/site_downloaders/test_gfycat.py @@ -31,5 +31,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 94bd240..bfb7405 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -149,6 +149,6 @@ def test_find_resources(test_url: str, expected_hashes: list[str]): downloader = Imgur(mock_download) results = downloader.find_resources() assert all([isinstance(res, Resource) for res in results]) - [res.download(120) for res in results] + [res.download() for res in results] hashes = set([res.hash.hexdigest() for res in results]) assert hashes == set(expected_hashes) diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py index 12144dd..e07da45 100644 --- a/tests/site_downloaders/test_pornhub.py +++ b/tests/site_downloaders/test_pornhub.py @@ -21,5 +21,5 @@ def test_find_resources_good(test_url: str, expected_hash: str): resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 097fbf4..571f044 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -37,5 +37,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index f3a97e1..1f6b81a 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -23,7 +23,7 @@ def test_find_resources_good(test_url: str, expected_hash: str): resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/test_download_filter.py b/tests/test_download_filter.py index ead2b2f..5def10c 100644 --- a/tests/test_download_filter.py +++ b/tests/test_download_filter.py @@ -46,7 +46,7 @@ def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadF ('http://reddit.com/test.gif', False), )) def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter): - test_resource = Resource(MagicMock(), test_url) + test_resource = Resource(MagicMock(), test_url, lambda: None) result = download_filter.check_resource(test_resource) assert result == expected @@ -59,6 +59,6 @@ def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilt )) def test_filter_empty_filter(test_url: str): download_filter = DownloadFilter() - test_resource = Resource(MagicMock(), test_url) + test_resource = Resource(MagicMock(), test_url, lambda: None) result = download_filter.check_resource(test_resource) assert result is True diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index e4c82ac..f596d89 
100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -119,7 +119,7 @@ def test_format_full( format_string_file: str, expected: str, reddit_submission: praw.models.Submission): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None) test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test')) assert do_test_path_equality(result, expected) @@ -136,7 +136,7 @@ def test_format_full_conform( format_string_directory: str, format_string_file: str, reddit_submission: praw.models.Submission): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None) test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') test_formatter.format_path(test_resource, Path('test')) @@ -156,7 +156,7 @@ def test_format_full_with_index_suffix( expected: str, reddit_submission: praw.models.Submission, ): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None) test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test'), index) assert do_test_path_equality(result, expected) @@ -216,7 +216,7 @@ def test_shorten_filenames(submission: MagicMock, tmp_path: Path): submission.author.name = 'test' submission.subreddit.display_name = 'test' submission.id = 'BBBBBB' - test_resource = Resource(submission, 'www.example.com/empty', '.jpeg') + test_resource = Resource(submission, 'www.example.com/empty', lambda: None, '.jpeg') test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}', 'ISO') result = test_formatter.format_path(test_resource, tmp_path) result.parent.mkdir(parents=True) @@ -296,7 +296,7 @@ def test_format_archive_entry_comment( ): test_comment = reddit_instance.comment(id=test_comment_id) test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO') - test_entry = Resource(test_comment, '', '.json') + test_entry = Resource(test_comment, '', lambda: None, '.json') result = test_formatter.format_path(test_entry, tmp_path) assert do_test_string_equality(result, expected_name) diff --git a/tests/test_resource.py b/tests/test_resource.py index 272c457..db9a6cc 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -21,7 +21,7 @@ from bdfr.resource import Resource ('https://www.test.com/test/test2/example.png?random=test#thing', '.png'), )) def test_resource_get_extension(test_url: str, expected: str): - test_resource = Resource(MagicMock(), test_url) + test_resource = Resource(MagicMock(), test_url, lambda: None) result = test_resource._determine_extension() assert result == expected @@ -31,6 +31,6 @@ def test_resource_get_extension(test_url: str, expected: str): ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'), )) def test_download_online_resource(test_url: str, expected_hash: str): - test_resource = Resource(MagicMock(), test_url) - test_resource.download(120) + test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url, 60)) + test_resource.download() assert test_resource.hash.hexdigest() == expected_hash From dbe8733fd44cb1b3055faa072c801e73e18d7865 Mon Sep 
17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Jul 2021 14:02:30 +1000 Subject: [PATCH 099/150] Refactor method to remove max wait time --- bdfr/resource.py | 5 ++++- bdfr/site_downloaders/direct.py | 2 +- bdfr/site_downloaders/erome.py | 2 +- bdfr/site_downloaders/gallery.py | 2 +- bdfr/site_downloaders/imgur.py | 2 +- bdfr/site_downloaders/redgifs.py | 2 +- tests/test_resource.py | 2 +- 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index 8f874ef..a1c90de 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -6,6 +6,7 @@ import logging import re import time import urllib.parse +from collections import namedtuple from typing import Callable, Optional import _hashlib @@ -29,7 +30,9 @@ class Resource: self.extension = self._determine_extension() @staticmethod - def retry_download(url: str, max_wait_time: int) -> Callable: + def retry_download(url: str) -> Callable: + max_wait_time = 300 + def http_download() -> Optional[bytes]: current_wait_time = 60 while True: diff --git a/bdfr/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py index df1a469..833acae 100644 --- a/bdfr/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -14,4 +14,4 @@ class Direct(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))] + return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url))] diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index 69b9ae3..6130560 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -29,7 +29,7 @@ class Erome(BaseDownloader): for link in links: if not re.match(r'https?://.*', link): link = 'https://' + link - out.append(Resource(self.post, link, Resource.retry_download(link, 300))) + out.append(Resource(self.post, link, Resource.retry_download(link))) return out @staticmethod diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index c016d28..158e338 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -31,7 +31,7 @@ class Gallery(BaseDownloader): if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') - return [Resource(self.post, url, Resource.retry_download(url, 300)) for url in image_urls] + return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls] @ staticmethod def _get_links(id_dict: list[dict]) -> list[str]: diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 79a1115..f0b7012 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -33,7 +33,7 @@ class Imgur(BaseDownloader): def _compute_image_url(self, image: dict) -> Resource: image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) - return Resource(self.post, image_url, Resource.retry_download(image_url, 300)) + return Resource(self.post, image_url, Resource.retry_download(image_url)) @staticmethod def _get_data(link: str) -> dict: diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index d4989e7..a62fedb 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -18,7 +18,7 @@ class Redgifs(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: media_url = 
self._get_link(self.post.url)
-        return [Resource(self.post, media_url, Resource.retry_download(media_url, 300), '.mp4')]
+        return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')]
 
     @staticmethod
     def _get_link(url: str) -> str:
diff --git a/tests/test_resource.py b/tests/test_resource.py
index db9a6cc..f3bbc9a 100644
--- a/tests/test_resource.py
+++ b/tests/test_resource.py
@@ -31,6 +31,6 @@ def test_resource_get_extension(test_url: str, expected: str):
     ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'),
 ))
 def test_download_online_resource(test_url: str, expected_hash: str):
-    test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url, 60))
+    test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url))
     test_resource.download()
     assert test_resource.hash.hexdigest() == expected_hash

From 7bca303b1b663848c5081fd9fa0543291a05396a Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 29 Jul 2021 19:10:10 +1000
Subject: [PATCH 100/150] Add in downloader parameters

---
 bdfr/downloader.py               |  2 +-
 bdfr/resource.py                 | 13 +++++++++----
 bdfr/site_downloaders/youtube.py |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 69aa818..70052b2 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector):
                     logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                     continue
                 try:
-                    res.download()
+                    res.download({'max_wait_time': self.args.max_wait_time})
                 except errors.BulkDownloaderException as e:
                     logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                                  f'with downloader {downloader_class.__name__}: {e}')
diff --git a/bdfr/resource.py b/bdfr/resource.py
index a1c90de..27ba84b 100644
--- a/bdfr/resource.py
+++ b/bdfr/resource.py
@@ -6,7 +6,6 @@ import logging
 import re
 import time
 import urllib.parse
-from collections import namedtuple
 from typing import Callable, Optional
 
 import _hashlib
@@ -33,8 +32,12 @@ class Resource:
     def retry_download(url: str) -> Callable:
         max_wait_time = 300
 
-        def http_download() -> Optional[bytes]:
+        def http_download(download_parameters: dict) -> Optional[bytes]:
             current_wait_time = 60
+            if 'max_wait_time' in download_parameters:
+                max_wait_time = download_parameters['max_wait_time']
+            else:
+                max_wait_time = 300
             while True:
                 try:
                     response = requests.get(url)
@@ -55,10 +58,12 @@ class Resource:
                         raise
         return http_download
 
-    def download(self):
+    def download(self, download_parameters: Optional[dict] = None):
+        if download_parameters is None:
+            download_parameters = {}
         if not self.content:
             try:
-                content = self.download_function()
+                content = self.download_function(download_parameters)
             except requests.exceptions.ConnectionError as e:
                 raise BulkDownloaderException(f'Could not download resource: {e}')
             except BulkDownloaderException:
diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py
index 126cb6a..a870c2e 100644
--- a/bdfr/site_downloaders/youtube.py
+++ b/bdfr/site_downloaders/youtube.py
@@ -40,7 +40,7 @@ class Youtube(BaseDownloader):
         yt_logger = logging.getLogger('youtube-dl')
         yt_logger.setLevel(logging.CRITICAL)
         ytdl_options['quiet'] = True
         ytdl_options['logger'] = yt_logger
-        def download() -> bytes:
+        def download(_: dict) -> bytes:
             with tempfile.TemporaryDirectory() as temp_dir:
                 download_path = Path(temp_dir).resolve()
                 ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
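After this patch every stored download callable takes a single dict of parameters. A minimal sketch of the new calling convention, mirroring tests/test_resource.py above (the URL and wait time below are illustrative, not taken from the repository):

    # Sketch only: build a Resource with a retrying download callable, then
    # pass per-call parameters; download() forwards the dict to the callable.
    from unittest.mock import MagicMock

    from bdfr.resource import Resource

    url = 'https://example.com/image.png'  # illustrative URL
    res = Resource(MagicMock(), url, Resource.retry_download(url))
    res.download({'max_wait_time': 120})  # illustrative wait time in seconds

From 87f283cc98ccb7743cfefd54b063d23142040431 Mon Sep 17 00:00:00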
2001 From: Serene-Arc Date: Fri, 3 Sep 2021 19:24:28 +1000 Subject: [PATCH 101/150] Fix backup config location --- bdfr/connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 0e78c8c..78ddc4f 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -184,8 +184,9 @@ class RedditConnector(metaclass=ABCMeta): logger.debug(f'Loading configuration from {path}') break if not self.config_location: - self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0] - shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) + with importlib.resources.path('bdfr', 'default_config.cfg') as path: + self.config_location = path + shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) if not self.config_location: raise errors.BulkDownloaderException('Could not find a configuration file to load') self.cfg_parser.read(self.config_location) From afc2a6416bc08b6009e7f4d27af132cf65705259 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 3 Sep 2021 16:39:00 +1000 Subject: [PATCH 102/150] Add integration test --- tests/integration_tests/test_download_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 305fe99..6fecd73 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -45,6 +45,7 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], + ['-s', 'hentai', '-L', 10, '--search', 'red'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], @@ -55,6 +56,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Added submissions from subreddit ' in result.output + assert 'Downloaded submission' in result.output @pytest.mark.online From defd6bca77ff2b56e91b289307d12fe422cda524 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Sep 2021 13:42:18 +1000 Subject: [PATCH 103/150] Tweak test conditions --- tests/test_connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index 15eede1..a275d9f 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -199,10 +199,9 @@ def test_get_subreddit_normal( @pytest.mark.reddit @pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), ( (('Python',), 'scraper', 10, 'all', 10), - (('Python',), '', 10, 'all', 10), + (('Python',), '', 10, 'all', 0), (('Python',), 'djsdsgewef', 10, 'all', 0), (('Python',), 'scraper', 10, 'year', 10), - (('Python',), 'scraper', 10, 'hour', 1), )) def test_get_subreddit_search( test_subreddits: list[str], @@ -226,6 +225,8 @@ def test_get_subreddit_search( assert all([isinstance(res, praw.models.Submission) for res in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) assert len(results) <= max_expected_len + if max_expected_len != 
0: + assert len(results) > 0 assert not any([isinstance(m, MagicMock) for m in results]) From 56575dc390fbefcbcbadb390e950fdda38561030 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Sep 2021 13:43:11 +1000 Subject: [PATCH 104/150] Add NSFW search test --- .../test_download_integration.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 6fecd73..57f39bf 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -45,7 +45,6 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], - ['-s', 'hentai', '-L', 10, '--search', 'red'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], @@ -59,6 +58,22 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): assert 'Downloaded submission' in result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-s', 'hentai', '-L', 10, '--search', 'red', '--authenticate'], +)) +def test_cli_download_search_subreddits_authenticated(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Added submissions from subreddit ' in result.output + assert 'Downloaded submission' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated From edc2db0ded1222b4b050f99421d939fc369ff104 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Sep 2021 13:50:03 +1000 Subject: [PATCH 105/150] Update test --- tests/site_downloaders/test_erome.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 2918bef..bab34bb 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -14,13 +14,13 @@ from bdfr.site_downloaders.erome import Erome 'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', )), ('https://www.erome.com/a/ORhX0FZz', ( - 'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' + 'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' )), )) def 
test_get_link(test_url: str, expected_urls: tuple[str]): From 940d646d30299747b6d0a0c3b25ea3fbafed0875 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 11 Sep 2021 12:13:21 +1000 Subject: [PATCH 106/150] Add Vidble module --- bdfr/site_downloaders/vidble.py | 48 +++++++++++++++++++ tests/site_downloaders/test_vidble.py | 67 +++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 bdfr/site_downloaders/vidble.py create mode 100644 tests/site_downloaders/test_vidble.py diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py new file mode 100644 index 0000000..2f8f4f4 --- /dev/null +++ b/bdfr/site_downloaders/vidble.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# coding=utf-8 +import itertools +import logging +import re +from typing import Optional + +import bs4 +import requests +from praw.models import Submission + +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + +logger = logging.getLogger(__name__) + + +class Vidble(BaseDownloader): + def __init__(self, post: Submission): + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + res = self.get_links(self.post.url) + if not res: + raise SiteDownloaderError(rf'No resources found at {self.post.url}') + res = [Resource(self.post, r, Resource.retry_download(r)) for r in res] + return res + + @staticmethod + def get_links(url: str) -> set[str]: + page = requests.get(url) + soup = bs4.BeautifulSoup(page.text, 'html.parser') + content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'}) + images = content_div.find_all('img') + images = [i.get('src') for i in images] + videos = content_div.find_all('source', attrs={'type': 'video/mp4'}) + videos = [v.get('src') for v in videos] + resources = filter(None, itertools.chain(images, videos)) + resources = ['https://www.vidble.com' + r for r in resources] + resources = [Vidble.change_med_url(r) for r in resources] + return set(resources) + + @staticmethod + def change_med_url(url: str) -> str: + out = re.sub(r'_med(\..{3,4})$', r'\1', url) + return out diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py new file mode 100644 index 0000000..1617bf1 --- /dev/null +++ b/tests/site_downloaders/test_vidble.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# coding=utf-8 +from unittest.mock import Mock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.vidble import Vidble + + +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('/RDFbznUvcN_med.jpg', '/RDFbznUvcN.jpg'), +)) +def test_change_med_url(test_url: str, expected: str): + result = Vidble.change_med_url(test_url) + assert result == expected + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('https://www.vidble.com/show/UxsvAssYe5', { + 'https://www.vidble.com/UxsvAssYe5.gif', + }), + ('https://vidble.com/show/RDFbznUvcN', { + 'https://www.vidble.com/RDFbznUvcN.jpg', + }), + ('https://vidble.com/album/h0jTLs6B', { + 'https://www.vidble.com/XG4eAoJ5JZ.jpg', + 'https://www.vidble.com/IqF5UdH6Uq.jpg', + 'https://www.vidble.com/VWuNsnLJMD.jpg', + 'https://www.vidble.com/sMmM8O650W.jpg', + }), + ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { + 'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4', + }), +)) +def test_get_links(test_url: str, 
expected: set[str]):
+    results = Vidble.get_links(test_url)
+    assert results == expected
+
+
+@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
+    ('https://www.vidble.com/show/UxsvAssYe5', {
+        '0ef2f8e0e0b45936d2fb3e6fbdf67e28',
+    }),
+    ('https://vidble.com/show/RDFbznUvcN', {
+        'c2dd30a71e32369c50eed86f86efff58',
+    }),
+    ('https://vidble.com/album/h0jTLs6B', {
+        '3b3cba02e01c91f9858a95240b942c71',
+        'dd6ecf5fc9e936f9fb614eb6a0537f99',
+        'b31a942cd8cdda218ed547bbc04c3a27',
+        '6f77c570b451eef4222804bd52267481',
+    }),
+    ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', {
+        'cebe9d5f24dba3b0443e5097f160ca83',
+    }),
+))
+def test_find_resources(test_url: str, expected_hashes: set[str]):
+    mock_download = Mock()
+    mock_download.url = test_url
+    downloader = Vidble(mock_download)
+    results = downloader.find_resources()
+    assert all([isinstance(res, Resource) for res in results])
+    [res.download() for res in results]
+    hashes = set([res.hash.hexdigest() for res in results])
+    assert hashes == set(expected_hashes)

From aee6f4add9a0e89686c194ff8be3723bb3ce24e6 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 11 Sep 2021 12:15:35 +1000
Subject: [PATCH 107/150] Add Vidble to download factory

---
 bdfr/site_downloaders/download_factory.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py
index 911e8fb..a4e9a6a 100644
--- a/bdfr/site_downloaders/download_factory.py
+++ b/bdfr/site_downloaders/download_factory.py
@@ -16,6 +16,7 @@ from bdfr.site_downloaders.imgur import Imgur
 from bdfr.site_downloaders.pornhub import PornHub
 from bdfr.site_downloaders.redgifs import Redgifs
 from bdfr.site_downloaders.self_post import SelfPost
+from bdfr.site_downloaders.vidble import Vidble
 from bdfr.site_downloaders.youtube import Youtube
 
 
@@ -46,11 +47,12 @@ class DownloadFactory:
             return Direct
         elif re.match(r'pornhub\.com.*', sanitised_url):
             return PornHub
+        elif re.match(r'vidble\.com', sanitised_url):
+            return Vidble
         elif YoutubeDlFallback.can_handle_link(sanitised_url):
             return YoutubeDlFallback
         else:
-            raise NotADownloadableLinkError(
-                f'No downloader module exists for url {url}')
+            raise NotADownloadableLinkError(f'No downloader module exists for url {url}')
 
     @staticmethod
     def sanitise_url(url: str) -> str:

From 89e24eca62bd7cf5fd6e9e8854f87a03d76f1309 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Sun, 12 Sep 2021 20:06:51 +0300
Subject: [PATCH 108/150] Bump version to v2.4

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 2969fe0..196bd9e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@ description_file = README.md
 description_content_type = text/markdown
 home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
 keywords = reddit, download, archive
-version = 2.2.0
+version = 2.4.0
 author = Ali Parlakci
 author_email = parlakciali@gmail.com
 maintainer = Serene Arc

From 33312687acce22c46c864c14904397e4655eecdd Mon Sep 17 00:00:00 2001
From: Eli Lipsitz
Date: Sun, 12 Sep 2021 16:50:31 -0500
Subject: [PATCH 109/150] imgur: download videos as mp4 instead of gif

Some imgur URLs have the extension ".gifv" and show up as a gif, even though
they're actually supposed to be mp4 videos. Imgur serves all videos/gifs as
both .gif and .mp4. The image dict has a key "prefer_video" to distinguish
the two.
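A minimal sketch of that check, for orientation (the dict below is illustrative;
only the 'hash', 'ext', and 'prefer_video' key names come from this message and
the diff that follows):

    # Hypothetical image record as described above; the values are made up.
    image = {'hash': 'OGeVuAe', 'ext': '.gif', 'prefer_video': True}

    ext = image['ext']
    if image.get('prefer_video', False):
        ext = '.mp4'  # prefer the video rendition over the static gif
    assert 'https://i.imgur.com/' + image['hash'] + ext == 'https://i.imgur.com/OGeVuAe.mp4'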
This commit overrides the .gif extension if "prefer_video" is true to ensure we download the submission as originally intended. --- bdfr/site_downloaders/imgur.py | 6 +++++- tests/site_downloaders/test_imgur.py | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index f0b7012..a3e3135 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -32,7 +32,11 @@ class Imgur(BaseDownloader): return out def _compute_image_url(self, image: dict) -> Resource: - image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) + ext = self._validate_extension(image['ext']) + if image.get('prefer_video', False): + ext = '.mp4' + + image_url = 'https://i.imgur.com/' + image['hash'] + ext return Resource(self.post, image_url, Resource.retry_download(image_url)) @staticmethod diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index bfb7405..4c754ec 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -111,7 +111,7 @@ def test_imgur_extension_validation_bad(test_extension: str): ), ( 'https://imgur.com/gallery/IjJJdlC', - ('7227d4312a9779b74302724a0cfa9081',), + ('740b006cf9ec9d6f734b6e8f5130bdab',), ), ( 'https://imgur.com/a/dcc84Gt', @@ -142,6 +142,14 @@ def test_imgur_extension_validation_bad(test_extension: str): 'https://imgur.com/ubYwpbk.GIFV', ('d4a774aac1667783f9ed3a1bd02fac0c',), ), + ( + 'https://i.imgur.com/j1CNCZY.gifv', + ('58e7e6d972058c18b7ecde910ca147e3',), + ), + ( + 'https://i.imgur.com/uTvtQsw.gifv', + ('46c86533aa60fc0e09f2a758513e3ac2',), + ), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() From 80baab8de7e64e731eb300ade7afbc4474126976 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 14 Sep 2021 13:47:46 +1000 Subject: [PATCH 110/150] Fix bug with different Vidble links --- bdfr/site_downloaders/vidble.py | 8 +++++- tests/site_downloaders/test_vidble.py | 36 ++++++++++++++++----------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py index 2f8f4f4..5cea0cb 100644 --- a/bdfr/site_downloaders/vidble.py +++ b/bdfr/site_downloaders/vidble.py @@ -22,7 +22,10 @@ class Vidble(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - res = self.get_links(self.post.url) + try: + res = self.get_links(self.post.url) + except AttributeError: + raise SiteDownloaderError(f'Could not read page at {self.post.url}') if not res: raise SiteDownloaderError(rf'No resources found at {self.post.url}') res = [Resource(self.post, r, Resource.retry_download(r)) for r in res] @@ -30,6 +33,9 @@ class Vidble(BaseDownloader): @staticmethod def get_links(url: str) -> set[str]: + if not re.search(r'vidble.com/(show/|album/|watch\?v)', url): + url = re.sub(r'/(\w*?)$', r'/show/\1', url) + page = requests.get(url) soup = bs4.BeautifulSoup(page.text, 'html.parser') content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'}) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 1617bf1..0c5ebb2 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -33,6 +33,9 @@ def test_change_med_url(test_url: str, expected: str): ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { 
'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4', }), + ('https://www.vidble.com/pHuwWkOcEb', { + 'https://www.vidble.com/pHuwWkOcEb.jpg', + }), )) def test_get_links(test_url: str, expected: set[str]): results = Vidble.get_links(test_url) @@ -40,21 +43,24 @@ def test_get_links(test_url: str, expected: set[str]): @pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://www.vidble.com/show/UxsvAssYe5', { - '0ef2f8e0e0b45936d2fb3e6fbdf67e28', - }), - ('https://vidble.com/show/RDFbznUvcN', { - 'c2dd30a71e32369c50eed86f86efff58', - }), - ('https://vidble.com/album/h0jTLs6B', { - '3b3cba02e01c91f9858a95240b942c71', - 'dd6ecf5fc9e936f9fb614eb6a0537f99', - 'b31a942cd8cdda218ed547bbc04c3a27', - '6f77c570b451eef4222804bd52267481', - }), - ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { - 'cebe9d5f24dba3b0443e5097f160ca83', - }), + ('https://www.vidble.com/show/UxsvAssYe5', { + '0ef2f8e0e0b45936d2fb3e6fbdf67e28', + }), + ('https://vidble.com/show/RDFbznUvcN', { + 'c2dd30a71e32369c50eed86f86efff58', + }), + ('https://vidble.com/album/h0jTLs6B', { + '3b3cba02e01c91f9858a95240b942c71', + 'dd6ecf5fc9e936f9fb614eb6a0537f99', + 'b31a942cd8cdda218ed547bbc04c3a27', + '6f77c570b451eef4222804bd52267481', + }), + ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { + 'cebe9d5f24dba3b0443e5097f160ca83', + }), + ('https://www.vidble.com/pHuwWkOcEb', { + '585f486dd0b2f23a57bddbd5bf185bc7', + }), )) def test_find_resources(test_url: str, expected_hashes: set[str]): mock_download = Mock() From 01923fda0e18b58e8667fac9502f9fab3aa1d9fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Tue, 14 Sep 2021 21:01:21 +0300 Subject: [PATCH 111/150] Bump version 2.4.1 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 196bd9e..5792355 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.4.0 +version = 2.4.1 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From 327cce5581766c4611d58e7646e485271ea6970c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 2 Oct 2021 12:11:53 +1000 Subject: [PATCH 112/150] Update tests for use with callbacks --- .../fallback_downloaders/test_youtubedl_fallback.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py index f70a91c..f268c0a 100644 --- a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py @@ -22,7 +22,7 @@ def test_can_handle_link(test_url: str, expected: bool): @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'), + ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'), ('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'), @@ -34,4 +34,6 @@ def test_find_resources(test_url: str, expected_hash: str): resources = downloader.find_resources() assert 
len(resources) == 1 assert isinstance(resources[0], Resource) + for res in resources: + res.download() assert resources[0].hash.hexdigest() == expected_hash From eeb2054606b3956cc032c1d6ecc1e7f6fe18ec7b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 2 Oct 2021 12:23:13 +1000 Subject: [PATCH 113/150] Switch to yt-dlp --- bdfr/site_downloaders/youtube.py | 8 ++++---- requirements.txt | 2 +- tests/site_downloaders/test_pornhub.py | 2 +- tests/site_downloaders/test_youtube.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index a870c2e..ba82007 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -5,7 +5,7 @@ import tempfile from pathlib import Path from typing import Callable, Optional -import youtube_dl +import yt_dlp from praw.models import Submission from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError @@ -45,9 +45,9 @@ class Youtube(BaseDownloader): download_path = Path(temp_dir).resolve() ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' try: - with youtube_dl.YoutubeDL(ytdl_options) as ydl: + with yt_dlp.YoutubeDL(ytdl_options) as ydl: ydl.download([self.post.url]) - except youtube_dl.DownloadError as e: + except yt_dlp.DownloadError as e: raise SiteDownloaderError(f'Youtube download failed: {e}') downloaded_files = list(download_path.iterdir()) @@ -64,7 +64,7 @@ class Youtube(BaseDownloader): def get_video_attributes(url: str) -> dict: yt_logger = logging.getLogger('youtube-dl') yt_logger.setLevel(logging.CRITICAL) - with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl: + with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl: try: result = ydl.extract_info(url, download=False) return result diff --git a/requirements.txt b/requirements.txt index e7b5ff1..8ceffdb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ ffmpeg-python>=0.2.0 praw>=7.2.0 pyyaml>=5.4.1 requests>=2.25.1 -youtube-dl>=2021.3.14 +yt-dlp>=2021.9.25 \ No newline at end of file diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py index e07da45..5c220cc 100644 --- a/tests/site_downloaders/test_pornhub.py +++ b/tests/site_downloaders/test_pornhub.py @@ -12,7 +12,7 @@ from bdfr.site_downloaders.pornhub import PornHub @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'), + ('https://www.pornhub.com/view_video.php?viewkey=ph6074c59798497', 'd9b99e4ebecf2d8d67efe5e70d2acf8a'), )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 1f6b81a..14c6648 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -13,8 +13,8 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'), - ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'), + ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'), + ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'), )) def test_find_resources_good(test_url: str, expected_hash: str): 
test_submission = MagicMock() From 9b23f273fc7bfc917325c1eb79f222ee65d5e640 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 2 Oct 2021 12:41:57 +1000 Subject: [PATCH 114/150] Separate function out --- bdfr/resource.py | 55 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index 27ba84b..68a42e1 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -30,33 +30,7 @@ class Resource: @staticmethod def retry_download(url: str) -> Callable: - max_wait_time = 300 - - def http_download(download_parameters: dict) -> Optional[bytes]: - current_wait_time = 60 - if 'max_wait_time' in download_parameters: - max_wait_time = download_parameters['max_wait_time'] - else: - max_wait_time = 300 - while True: - try: - response = requests.get(url) - if re.match(r'^2\d{2}', str(response.status_code)) and response.content: - return response.content - elif response.status_code in (408, 429): - raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') - else: - raise BulkDownloaderException( - f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') - except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: - logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}') - time.sleep(current_wait_time) - if current_wait_time < max_wait_time: - current_wait_time += 60 - else: - logger.error(f'Max wait time exceeded for resource at url {url}') - raise - return http_download + return lambda global_params: Resource.http_download(url, global_params) def download(self, download_parameters: Optional[dict] = None): if download_parameters is None: @@ -82,3 +56,30 @@ class Resource: match = re.search(extension_pattern, stripped_url) if match: return match.group(1) + + @staticmethod + def http_download(url: str, download_parameters: dict) -> Optional[bytes]: + headers = download_parameters.get('headers') + current_wait_time = 60 + if 'max_wait_time' in download_parameters: + max_wait_time = download_parameters['max_wait_time'] + else: + max_wait_time = 300 + while True: + try: + response = requests.get(url, headers=headers) + if re.match(r'^2\d{2}', str(response.status_code)) and response.content: + return response.content + elif response.status_code in (408, 429): + raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') + else: + raise BulkDownloaderException( + f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') + except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: + logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}') + time.sleep(current_wait_time) + if current_wait_time < max_wait_time: + current_wait_time += 60 + else: + logger.error(f'Max wait time exceeded for resource at url {url}') + raise From c6c6002ab2bbacf4c3440c25b50440d3cda80d16 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 2 Oct 2021 12:52:12 +1000 Subject: [PATCH 115/150] Update Erome module --- bdfr/site_downloaders/erome.py | 15 +++++++++++++-- tests/site_downloaders/test_erome.py | 21 +++++++-------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index 6130560..6250415 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -2,7 +2,7 @@ import logging import re -from typing import 
Optional +from typing import Callable, Optional import bs4 from praw.models import Submission @@ -29,7 +29,7 @@ class Erome(BaseDownloader): for link in links: if not re.match(r'https?://.*', link): link = 'https://' + link - out.append(Resource(self.post, link, Resource.retry_download(link))) + out.append(Resource(self.post, link, self.erome_download(link))) return out @staticmethod @@ -43,3 +43,14 @@ class Erome(BaseDownloader): out.extend([vid.get('src') for vid in videos]) return set(out) + + @staticmethod + def erome_download(url: str) -> Callable: + download_parameters = { + 'headers': { + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/88.0.4324.104 Safari/537.36', + 'Referer': 'https://www.erome.com/', + }, + } + return lambda global_params: Resource.http_download(url, global_params | download_parameters) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index bab34bb..d27cbb7 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -30,25 +30,18 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): @pytest.mark.online @pytest.mark.slow -@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://www.erome.com/a/vqtPuLXh', { - '5da2a8d60d87bed279431fdec8e7d72f' - }), - ('https://www.erome.com/a/lGrcFxmb', { - '0e98f9f527a911dcedde4f846bb5b69f', - '25696ae364750a5303fc7d7dc78b35c1', - '63775689f438bd393cde7db6d46187de', - 'a1abf398cfd4ef9cfaf093ceb10c746a', - 'bd9e1a4ea5ef0d6ba47fb90e337c2d14' - }), +@pytest.mark.parametrize(('test_url', 'expected_hashes_len'), ( + ('https://www.erome.com/a/vqtPuLXh', 1), + ('https://www.erome.com/a/4tP3KI6F', 1), )) -def test_download_resource(test_url: str, expected_hashes: tuple[str]): +def test_download_resource(test_url: str, expected_hashes_len: int): # Can't compare hashes for this test, Erome doesn't return the exact same file from request to request so the hash # will change back and forth randomly mock_submission = MagicMock() mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() - [res.download() for res in resources] + for res in resources: + res.download() resource_hashes = [res.hash.hexdigest() for res in resources] - assert len(resource_hashes) == len(expected_hashes) + assert len(resource_hashes) == expected_hashes_len From 4d3f0f986279051ff10460936adfbb45a2d04624 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 2 Oct 2021 13:43:15 +1000 Subject: [PATCH 116/150] Add Youtube test case --- tests/site_downloaders/test_youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 14c6648..684eb20 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -15,6 +15,7 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'), ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'), + ('https://youtu.be/TMqPOlp4tNo', 'f68c00b018162857f3df4844c45302e7'), # Age restricted )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() From 03d0aec4f6e9a3f03f675dbf4aba2ef6bbbfea80 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 20 Oct 2021 10:32:23 +1000 Subject: [PATCH 117/150] Increase version number --- 
setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 5792355..94ae1de 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.4.1 +version = 2.4.2 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From f716d982b0b75ebbacf7c06a7a006ccc313cea30 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 20 Oct 2021 10:51:28 +1000 Subject: [PATCH 118/150] Update Erome tests --- tests/site_downloaders/test_erome.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index d27cbb7..e06fab5 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 - +import re from unittest.mock import MagicMock import pytest @@ -11,21 +11,21 @@ from bdfr.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( ('https://www.erome.com/a/vqtPuLXh', ( - 'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', + r'https://s\d+.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', )), ('https://www.erome.com/a/ORhX0FZz', ( - 'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' + r'https://s\d+.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' )), )) def test_get_link(test_url: str, expected_urls: tuple[str]): result = Erome. _get_links(test_url) - assert set(result) == set(expected_urls) + assert all([any([re.match(p, r) for r in result]) for p in expected_urls]) @pytest.mark.online From e493ab048aeb02e45d83520e137e6dece6c78b11 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 5 Nov 2021 12:47:46 +1000 Subject: [PATCH 119/150] Fix bug with period not separating file extension --- bdfr/file_name_formatter.py | 5 ++++- tests/test_file_name_formatter.py | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 2fbf95f..e81fe7f 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -110,8 +110,11 @@ class FileNameFormatter: index = f'_{str(index)}' if index else '' if not resource.extension: raise BulkDownloaderException(f'Resource from {resource.url} has no extension') - ending = index + resource.extension file_name = str(self._format_name(resource.source_submission, self.file_format_string)) + if not re.match(r'.*\.$',file_name) and not re.match(r'^\..*',resource.extension): + ending = index + '.' 
+ resource.extension + else: + ending = index + resource.extension try: file_path = self._limit_file_name_length(file_name, ending, subfolder) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index f596d89..f9bb2ad 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -13,6 +13,9 @@ import pytest from bdfr.file_name_formatter import FileNameFormatter from bdfr.resource import Resource +from bdfr.site_downloaders.base_downloader import BaseDownloader +from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback +from bdfr.site_downloaders.youtube import Youtube @pytest.fixture() @@ -380,3 +383,23 @@ def test_windows_max_path(tmp_path: Path): result = FileNameFormatter._limit_file_name_length('test' * 100, '_1.png', tmp_path) assert len(str(result)) <= 260 assert len(result.name) <= (260 - len(str(tmp_path))) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( + ('gphmnr', YoutubeDlFallback, {'He has a lot to say today.mp4'}), + ('d0oir2', YoutubeDlFallback, {"Crunk's finest moment. Welcome to the new subreddit!.mp4"}), +)) +def test_name_submission( + test_reddit_id: str, + test_downloader: type(BaseDownloader), + expected_names: set[str], + reddit_instance: praw.reddit.Reddit, +): + test_submission = reddit_instance.submission(id=test_reddit_id) + test_resources = test_downloader(test_submission).find_resources() + test_formatter = FileNameFormatter('{TITLE}', '', '') + results = test_formatter.format_resource_paths(test_resources, Path('.')) + results = set([r[0].name for r in results]) + assert expected_names == results From 801784c46d7764832f98e988faa804e199003a23 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 5 Nov 2021 13:23:55 +1000 Subject: [PATCH 120/150] Fix a crash when downloading a disabled pornhub video --- bdfr/site_downloaders/pornhub.py | 8 +++++++- tests/site_downloaders/test_pornhub.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py index c2bc0ad..748454e 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -6,6 +6,7 @@ from typing import Optional from praw.models import Submission +from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.youtube import Youtube @@ -22,10 +23,15 @@ class PornHub(Youtube): 'format': 'best', 'nooverwrites': True, } + if video_attributes := super().get_video_attributes(self.post.url): + extension = video_attributes['ext'] + else: + raise SiteDownloaderError() + out = Resource( self.post, self.post.url, super()._download_video(ytdl_options), - super().get_video_attributes(self.post.url)['ext'], + extension, ) return [out] diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py index 5c220cc..cbe3662 100644 --- a/tests/site_downloaders/test_pornhub.py +++ b/tests/site_downloaders/test_pornhub.py @@ -5,6 +5,7 @@ from unittest.mock import MagicMock import pytest +from bdfr.exceptions import SiteDownloaderError from bdfr.resource import Resource from bdfr.site_downloaders.pornhub import PornHub @@ -13,6 +14,7 @@ from bdfr.site_downloaders.pornhub import PornHub @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( 
 ('https://www.pornhub.com/view_video.php?viewkey=ph6074c59798497', 'd9b99e4ebecf2d8d67efe5e70d2acf8a'),
+    ('https://www.pornhub.com/view_video.php?viewkey=ph5ede121f0d3f8', ''),
 ))
 def test_find_resources_good(test_url: str, expected_hash: str):
     test_submission = MagicMock()
@@ -23,3 +25,15 @@ def test_find_resources_good(test_url: str, expected_hash: str):
     assert isinstance(resources[0], Resource)
     resources[0].download()
     assert resources[0].hash.hexdigest() == expected_hash
+
+
+@pytest.mark.online
+@pytest.mark.parametrize('test_url', (
+    'https://www.pornhub.com/view_video.php?viewkey=ph5ede121f0d3f8',
+))
+def test_find_resources_broken(test_url: str):
+    test_submission = MagicMock()
+    test_submission.url = test_url
+    downloader = PornHub(test_submission)
+    with pytest.raises(SiteDownloaderError):
+        downloader.find_resources()

From 4be0f5ec190df4d3bc9d9672a45e01ff49600a41 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 15 Nov 2021 11:57:54 +1000
Subject: [PATCH 121/150] Add more tests for file length checking

---
 bdfr/file_name_formatter.py       | 12 ++++++------
 tests/test_file_name_formatter.py | 25 ++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py
index e81fe7f..542a722 100644
--- a/bdfr/file_name_formatter.py
+++ b/bdfr/file_name_formatter.py
@@ -111,19 +111,19 @@ class FileNameFormatter:
         if not resource.extension:
             raise BulkDownloaderException(f'Resource from {resource.url} has no extension')
         file_name = str(self._format_name(resource.source_submission, self.file_format_string))
-        if not re.match(r'.*\.$',file_name) and not re.match(r'^\..*',resource.extension):
+        if not re.match(r'.*\.$', file_name) and not re.match(r'^\..*', resource.extension):
             ending = index + '.' + resource.extension
         else:
             ending = index + resource.extension
 
         try:
-            file_path = self._limit_file_name_length(file_name, ending, subfolder)
+            file_path = self.limit_file_name_length(file_name, ending, subfolder)
         except TypeError:
             raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')
         return file_path
 
     @staticmethod
-    def _limit_file_name_length(filename: str, ending: str, root: Path) -> Path:
+    def limit_file_name_length(filename: str, ending: str, root: Path) -> Path:
         root = root.resolve().expanduser()
         possible_id = re.search(r'((?:_\w{6})?$)', filename)
         if possible_id:
@@ -133,9 +133,9 @@ class FileNameFormatter:
         max_length_chars = 255 - len(ending)
         max_length_bytes = 255 - len(ending.encode('utf-8'))
         max_path_length = max_path - len(ending) - len(str(root)) - 1
-        while len(filename) > max_length_chars or \
-                len(filename.encode('utf-8')) > max_length_bytes or \
-                len(filename) > max_path_length:
+        while any([len(filename) > max_length_chars,
+                   len(filename.encode('utf-8')) > max_length_bytes,
+                   len(filename) > max_path_length]):
             filename = filename[:-1]
         return Path(root, filename + ending)
diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py
index f9bb2ad..97fd851 100644
--- a/tests/test_file_name_formatter.py
+++ b/tests/test_file_name_formatter.py
@@ -188,7 +188,7 @@ def test_format_multiple_resources():
     ('😍💕✨' * 100, '_1.png'),
 ))
 def test_limit_filename_length(test_filename: str, test_ending: str):
-    result = FileNameFormatter._limit_file_name_length(test_filename, test_ending, Path('.'))
+    result = FileNameFormatter.limit_file_name_length(test_filename, test_ending, Path('.'))
     assert len(result.name) <= 255
     assert len(result.name.encode('utf-8')) <= 255
     assert len(str(result)) <= FileNameFormatter.find_max_path_length()
@@ -207,15 +207,15 @@ def test_limit_filename_length(test_filename: str, test_ending: str):
     ('😍💕✨' * 100 + '_aaa1aa', '_1.png', '_aaa1aa_1.png'),
 ))
 def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str, expected_end: str):
-    result = FileNameFormatter._limit_file_name_length(test_filename, test_ending, Path('.'))
+    result = FileNameFormatter.limit_file_name_length(test_filename, test_ending, Path('.'))
     assert len(result.name) <= 255
     assert len(result.name.encode('utf-8')) <= 255
     assert result.name.endswith(expected_end)
     assert len(str(result)) <= FileNameFormatter.find_max_path_length()
 
 
-def test_shorten_filenames(submission: MagicMock, tmp_path: Path):
-    submission.title = 'A' * 300
+def test_shorten_filename_real(submission: MagicMock, tmp_path: Path):
+    submission.title = 'A' * 500
     submission.author.name = 'test'
     submission.subreddit.display_name = 'test'
     submission.id = 'BBBBBB'
@@ -226,6 +226,21 @@ def test_shorten_filename_real(submission: MagicMock, tmp_path: Path):
     result.touch()
+
+
+@pytest.mark.parametrize(('test_name', 'test_ending'), (
+    ('a', 'b'),
+    ('a', '_bbbbbb.jpg'),
+    ('a' * 20, '_bbbbbb.jpg'),
+    ('a' * 50, '_bbbbbb.jpg'),
+    ('a' * 500, '_bbbbbb.jpg'),
+))
+def test_shorten_path(test_name: str, test_ending: str, tmp_path: Path):
+    result = FileNameFormatter.limit_file_name_length(test_name, test_ending, tmp_path)
+    assert len(str(result.name)) <= 255
+    assert len(str(result.name).encode('UTF-8')) <= 255
+    assert len(str(result.name).encode('cp1252')) <= 255
+    assert len(str(result)) <= FileNameFormatter.find_max_path_length()
 
 
 @pytest.mark.parametrize(('test_string', 'expected'), (
     ('test', 'test'),
     ('test😍', 'test'),
@@ -380,7 +395,7 @@ def test_get_max_path_length():
 
 def test_windows_max_path(tmp_path: Path):
     with unittest.mock.patch('platform.system', return_value='Windows'):
         with unittest.mock.patch('bdfr.file_name_formatter.FileNameFormatter.find_max_path_length', return_value=260):
-            result = FileNameFormatter._limit_file_name_length('test' * 100, '_1.png', tmp_path)
+            result = FileNameFormatter.limit_file_name_length('test' * 100, '_1.png', tmp_path)
             assert len(str(result)) <= 260
             assert len(result.name) <= (260 - len(str(tmp_path)))

From f05e909008a23ee401aed28e8fff7ecaf939c031 Mon Sep 17 00:00:00 2001
From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com>
Date: Tue, 9 Nov 2021 19:30:27 -0500
Subject: [PATCH 122/150] Stop videos from being downloaded as images

Erroneous .gifv extensions such as .giff or .gift resolve to a static image
and are downloaded by the direct downloader. (ex: https://i.imgur.com/OGeVuAe.giff )

---
 bdfr/site_downloaders/download_factory.py | 2 +-
 bdfr/site_downloaders/imgur.py            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py
index a4e9a6a..ccff8e7 100644
--- a/bdfr/site_downloaders/download_factory.py
+++ b/bdfr/site_downloaders/download_factory.py
@@ -24,7 +24,7 @@ class DownloadFactory:
     @staticmethod
     def pull_lever(url: str) -> Type[BaseDownloader]:
         sanitised_url = DownloadFactory.sanitise_url(url)
-        if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url):
+        if re.match(r'(i\.)?imgur.*\.gif.*$', sanitised_url):
             return Imgur
         elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \
                 not DownloadFactory.is_web_resource(sanitised_url):
             return Direct
diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py
index a3e3135..905581e 100644
--- a/bdfr/site_downloaders/imgur.py
+++ b/bdfr/site_downloaders/imgur.py
@@ -42,9 +42,9 @@ class Imgur(BaseDownloader):
     @staticmethod
     def _get_data(link: str) -> dict:
         link = link.rstrip('?')
-        if re.match(r'(?i).*\.gifv$', link):
+        if re.match(r'(?i).*\.gif.*$', link):
             link = link.replace('i.imgur', 'imgur')
-            link = re.sub('(?i)\\.gifv$', '', link)
+            link = re.sub('(?i)\\.gif.*$', '', link)
 
         res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})

From bd802df38c13c3ebec0ff6f3a72ef3da26403c29 Mon Sep 17 00:00:00 2001
From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com>
Date: Wed, 10 Nov 2021 15:39:12 -0500
Subject: [PATCH 123/150] Update test_imgur.py

Adding test for .giff/.gift imgur extension

---
 tests/site_downloaders/test_imgur.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py
index 4c754ec..85b84c6 100644
--- a/tests/site_downloaders/test_imgur.py
+++ b/tests/site_downloaders/test_imgur.py
@@ -150,6 +150,14 @@ def test_imgur_extension_validation_bad(test_extension: str):
         'https://i.imgur.com/uTvtQsw.gifv',
         ('46c86533aa60fc0e09f2a758513e3ac2',),
     ),
+    (
+        'https://i.imgur.com/OGeVuAe.giff',
+        ('77389679084d381336f168538793f218',)
+    ),
+    (
+        'https://i.imgur.com/OGeVuAe.gift',
+        ('77389679084d381336f168538793f218',)
+    ),
 ))
 def test_find_resources(test_url: str, expected_hashes: list[str]):
     mock_download = Mock()

From 8c3af7029eb37c19d4e32dd87edcc87f8565f791 Mon Sep 17 00:00:00 2001
From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com>
Date: Wed, 10 Nov 2021 20:33:58 -0500
Subject: [PATCH 124/150] Update test_imgur.py

---
 tests/site_downloaders/test_imgur.py | 4 ++--
 1 file changed, 2
insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 85b84c6..359bdc3 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -153,11 +153,11 @@ def test_imgur_extension_validation_bad(test_extension: str): ( 'https://i.imgur.com/OGeVuAe.giff', ('77389679084d381336f168538793f218',) - ) + ), ( 'https://i.imgur.com/OGeVuAe.gift', ('77389679084d381336f168538793f218',) - ) + ), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() From 53562f48737059d64440d3036d141aa33c55c0e0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Nov 2021 11:03:12 +1000 Subject: [PATCH 125/150] Fix regex --- bdfr/site_downloaders/download_factory.py | 2 +- bdfr/site_downloaders/imgur.py | 4 ++-- tests/site_downloaders/test_download_factory.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index ccff8e7..2f4cf9d 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -24,7 +24,7 @@ class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: sanitised_url = DownloadFactory.sanitise_url(url) - if re.match(r'(i\.)?imgur.*\.gif.*$', sanitised_url): + if re.match(r'(i\.)?imgur.*\.gif.+$', sanitised_url): return Imgur elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \ not DownloadFactory.is_web_resource(sanitised_url): diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 905581e..1f669d0 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -42,9 +42,9 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: link = link.rstrip('?') - if re.match(r'(?i).*\.gif.*$', link): + if re.match(r'(?i).*\.gif.+$', link): link = link.replace('i.imgur', 'imgur') - link = re.sub('(?i)\\.gif.*$', '', link) + link = re.sub('(?i)\\.gif.+$', '', link) res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 95b522d..15466cb 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -30,6 +30,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://imgur.com/BuzvZwb.gifv', Imgur), ('https://i.imgur.com/6fNdLst.gif', Direct), ('https://imgur.com/a/MkxAzeg', Imgur), + ('https://i.imgur.com/OGeVuAe.giff', Imgur), ('https://www.reddit.com/gallery/lu93m7', Gallery), ('https://gfycat.com/concretecheerfulfinwhale', Gfycat), ('https://www.erome.com/a/NWGw0F09', Erome), From 17939fe47ce0c7371f88316e574f5439b94b90d1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Nov 2021 11:48:29 +1000 Subject: [PATCH 126/150] Fix bug with youtube class and children --- .../fallback_downloaders/youtubedl_fallback.py | 8 +++++--- bdfr/site_downloaders/youtube.py | 11 ++++++----- .../fallback_downloaders/test_youtubedl_fallback.py | 11 +++++++++++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 6ede405..d8753bd 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -6,6 +6,7 @@ 
from typing import Optional from praw.models import Submission +from bdfr.exceptions import NotADownloadableLinkError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.fallback_downloaders.fallback_downloader import BaseFallbackDownloader @@ -29,8 +30,9 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): @staticmethod def can_handle_link(url: str) -> bool: - attributes = YoutubeDlFallback.get_video_attributes(url) + try: + attributes = YoutubeDlFallback.get_video_attributes(url) + except NotADownloadableLinkError: + return False if attributes: return True - else: - return False diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index ba82007..f18f405 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -27,10 +27,7 @@ class Youtube(BaseDownloader): 'nooverwrites': True, } download_function = self._download_video(ytdl_options) - try: - extension = self.get_video_attributes(self.post.url)['ext'] - except KeyError: - raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}') + extension = self.get_video_attributes(self.post.url)['ext'] res = Resource(self.post, self.post.url, download_function, extension) return [res] @@ -67,6 +64,10 @@ class Youtube(BaseDownloader): with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl: try: result = ydl.extract_info(url, download=False) - return result except Exception as e: logger.exception(e) + raise NotADownloadableLinkError(f'Video info extraction failed for {url}') + if 'ext' in result: + return result + else: + raise NotADownloadableLinkError(f'Video info extraction failed for {url}') diff --git a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py index f268c0a..0590687 100644 --- a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py @@ -4,6 +4,7 @@ from unittest.mock import MagicMock import pytest +from bdfr.exceptions import NotADownloadableLinkError from bdfr.resource import Resource from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback @@ -13,12 +14,22 @@ from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import Youtub ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', True), ('https://www.youtube.com/watch?v=P19nvJOmqCc', True), ('https://www.example.com/test', False), + ('https://milesmatrix.bandcamp.com/album/la-boum/', False), )) def test_can_handle_link(test_url: str, expected: bool): result = YoutubeDlFallback.can_handle_link(test_url) assert result == expected +@pytest.mark.online +@pytest.mark.parametrize('test_url', ( + 'https://milesmatrix.bandcamp.com/album/la-boum/', +)) +def test_info_extraction_bad(test_url: str): + with pytest.raises(NotADownloadableLinkError): + YoutubeDlFallback.get_video_attributes(test_url) + + @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( From b4dd89cddce3c6b880d0f25cb2d21e39a453d8d3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Nov 2021 12:10:26 +1000 Subject: [PATCH 127/150] Add section for common command-line tricks --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 89a4e90..9f2ef7d 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,8 @@ This is a tool to download 
submissions or submission data from Reddit. It can be If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate. +Included in this README are a few example Bash tricks to get certain behaviour. For that, see [Common Command Tricks](#common-command-tricks). + ## Installation *Bulk Downloader for Reddit* needs Python version 3.9 or above. Please update Python before installation to meet the requirement. Then, you can install it as such: ```bash @@ -208,6 +210,16 @@ The following options are for the `archive` command specifically. The `clone` command can take all the options listed above for both the `archive` and `download` commands since it performs the functions of both. +## Common Command Tricks + +A common use case is for subreddits/users to be loaded from a file. The BDFR doesn't support this directly but it is simple enough to do through the command-line. Consider a list of usernames to download; they can be passed through to the BDFR with the following command, assuming that the usernames are in a text file: + +```bash +cat users.txt | xargs -L 1 echo --user | xargs -L 50 python3 -m bdfr download +``` + +The part `-L 50` is to make sure that the character limit for a single line isn't exceeded, but may not be necessary. This can also be used to load subreddits from a file, simply exchange `--user` with `--subreddit` and so on. + ## Authentication and Security The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits. From fc279705c1691c2a29b1abb43c98755f1663428a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Nov 2021 12:11:58 +1000 Subject: [PATCH 128/150] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f2ef7d..f8f29e9 100644 --- a/README.md +++ b/README.md @@ -332,7 +332,7 @@ The BDFR can be run in multiple instances with multiple configurations, either c Running these scenarios consecutively is done easily, like any single run. Configuration files that differ may be specified with the `--config` option to switch between tokens, for example. Otherwise, almost all configuration for data sources can be specified per-run through the command line. -Running scenarious concurrently (at the same time) however, is more complicated. The BDFR will look to a single, static place to put the detailed log files, in a directory with the configuration file specified above. If there are multiple instances, or processes, of the BDFR running at the same time, they will all be trying to write to a single file. On Linux and other UNIX based operating systems, this will succeed, though there is a substantial risk that the logfile will be useless due to garbled and jumbled data. On Windows however, attempting this will raise an error that crashes the program as Windows forbids multiple processes from accessing the same file. +Running scenarios concurrently (at the same time) however, is more complicated. 
The BDFR will look to a single, static place to put the detailed log files, in a directory with the configuration file specified above. If there are multiple instances, or processes, of the BDFR running at the same time, they will all be trying to write to a single file. On Linux and other UNIX based operating systems, this will succeed, though there is a substantial risk that the logfile will be useless due to garbled and jumbled data. On Windows however, attempting this will raise an error that crashes the program as Windows forbids multiple processes from accessing the same file. The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine. From f19171a1b45bed66e4a241d4950dd2ffc39408e1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Nov 2021 12:25:05 +1000 Subject: [PATCH 129/150] Add mention of bash scripts --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index f8f29e9..7bc4f0b 100644 --- a/README.md +++ b/README.md @@ -336,6 +336,10 @@ Running scenarios concurrently (at the same time) however, is more complicated. The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine. +## Manipulating Logfiles + +The logfiles that the BDFR outputs are consistent and quite detailed and in a format that is amenable to regex. To this end, a number of bash scripts have been [included here](./scripts). They show examples for how to extract successfully downloaded IDs, failed IDs, and more besides. + ## List of currently supported sources - Direct links (links leading to a file) From 6dd17c876254844a524269d5c2c40e6dccd965af Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Nov 2021 16:17:43 +1000 Subject: [PATCH 130/150] Remove unused import --- tests/test_file_name_formatter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 97fd851..29ee50f 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -15,7 +15,6 @@ from bdfr.file_name_formatter import FileNameFormatter from bdfr.resource import Resource from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback -from bdfr.site_downloaders.youtube import Youtube @pytest.fixture() From 2dd446a402a6cfe848d6fd42dd80b93256b6ee0c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Nov 2021 13:14:28 +1000 Subject: [PATCH 131/150] Fix max path length calculations --- bdfr/file_name_formatter.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 542a722..3e8832b 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -130,14 +130,19 @@ class FileNameFormatter: ending = possible_id.group(1) + ending filename = filename[:possible_id.start()] max_path = FileNameFormatter.find_max_path_length() - max_length_chars = 255 - len(ending) - max_length_bytes = 255 - len(ending.encode('utf-8')) + max_file_part_length_chars = 255 - len(ending) + max_file_part_length_bytes = 255 - len(ending.encode('utf-8')) max_path_length = max_path - len(ending) - len(str(root)) - 1 - while any([len(filename) > max_length_chars, - len(filename.encode('utf-8')) 
> max_length_bytes, - len(filename) > max_path_length]): + + out = Path(root, filename + ending) + while any([len(filename) > max_file_part_length_chars, + len(filename.encode('utf-8')) > max_file_part_length_bytes, + len(str(out)) > max_path_length, + ]): filename = filename[:-1] - return Path(root, filename + ending) + out = Path(root, filename + ending) + + return out @staticmethod def find_max_path_length() -> int: From 892564333176e17b919b21470571a789c50c3167 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 24 Nov 2021 10:40:18 +1000 Subject: [PATCH 132/150] Rename module to reflect backend change --- bdfr/site_downloaders/download_factory.py | 6 +++--- .../{youtubedl_fallback.py => ytdlp_fallback.py} | 6 +++--- ...st_youtubedl_fallback.py => test_ytdlp_fallback.py} | 8 ++++---- tests/site_downloaders/test_download_factory.py | 10 +++++----- tests/test_file_name_formatter.py | 6 +++--- 5 files changed, 18 insertions(+), 18 deletions(-) rename bdfr/site_downloaders/fallback_downloaders/{youtubedl_fallback.py => ytdlp_fallback.py} (84%) rename tests/site_downloaders/fallback_downloaders/{test_youtubedl_fallback.py => test_ytdlp_fallback.py} (86%) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 2f4cf9d..91489a0 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -9,7 +9,7 @@ from bdfr.exceptions import NotADownloadableLinkError from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.erome import Erome -from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback +from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur @@ -49,8 +49,8 @@ class DownloadFactory: return PornHub elif re.match(r'vidble\.com', sanitised_url): return Vidble - elif YoutubeDlFallback.can_handle_link(sanitised_url): - return YoutubeDlFallback + elif YtdlpFallback.can_handle_link(sanitised_url): + return YtdlpFallback else: raise NotADownloadableLinkError(f'No downloader module exists for url {url}') diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py similarity index 84% rename from bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py rename to bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py index d8753bd..1225624 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py @@ -15,9 +15,9 @@ from bdfr.site_downloaders.youtube import Youtube logger = logging.getLogger(__name__) -class YoutubeDlFallback(BaseFallbackDownloader, Youtube): +class YtdlpFallback(BaseFallbackDownloader, Youtube): def __init__(self, post: Submission): - super(YoutubeDlFallback, self).__init__(post) + super(YtdlpFallback, self).__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: out = Resource( @@ -31,7 +31,7 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): @staticmethod def can_handle_link(url: str) -> bool: try: - attributes = YoutubeDlFallback.get_video_attributes(url) + attributes = YtdlpFallback.get_video_attributes(url) except NotADownloadableLinkError: return False if 
attributes: diff --git a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py similarity index 86% rename from tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py rename to tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 0590687..2c4a4f6 100644 --- a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -6,7 +6,7 @@ import pytest from bdfr.exceptions import NotADownloadableLinkError from bdfr.resource import Resource -from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback +from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback @pytest.mark.online @@ -17,7 +17,7 @@ from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import Youtub ('https://milesmatrix.bandcamp.com/album/la-boum/', False), )) def test_can_handle_link(test_url: str, expected: bool): - result = YoutubeDlFallback.can_handle_link(test_url) + result = YtdlpFallback.can_handle_link(test_url) assert result == expected @@ -27,7 +27,7 @@ def test_can_handle_link(test_url: str, expected: bool): )) def test_info_extraction_bad(test_url: str): with pytest.raises(NotADownloadableLinkError): - YoutubeDlFallback.get_video_attributes(test_url) + YtdlpFallback.get_video_attributes(test_url) @pytest.mark.online @@ -41,7 +41,7 @@ def test_info_extraction_bad(test_url: str): def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() test_submission.url = test_url - downloader = YoutubeDlFallback(test_submission) + downloader = YtdlpFallback(test_submission) resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 15466cb..441b554 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -9,7 +9,7 @@ from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.download_factory import DownloadFactory from bdfr.site_downloaders.erome import Erome -from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback +from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur @@ -42,10 +42,10 @@ from bdfr.site_downloaders.youtube import Youtube ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), - ('https://v.redd.it/9z1dnk3xr5k61', YoutubeDlFallback), - ('https://streamable.com/dt46y', YoutubeDlFallback), - ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback), - ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback), + ('https://v.redd.it/9z1dnk3xr5k61', YtdlpFallback), + ('https://streamable.com/dt46y', YtdlpFallback), + ('https://vimeo.com/channels/31259/53576664', YtdlpFallback), + ('http://video.pbs.org/viralplayer/2365173446/', YtdlpFallback), ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub), )) def 
test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 29ee50f..e60ae8d 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -14,7 +14,7 @@ import pytest from bdfr.file_name_formatter import FileNameFormatter from bdfr.resource import Resource from bdfr.site_downloaders.base_downloader import BaseDownloader -from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import YoutubeDlFallback +from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback @pytest.fixture() @@ -402,8 +402,8 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( - ('gphmnr', YoutubeDlFallback, {'He has a lot to say today.mp4'}), - ('d0oir2', YoutubeDlFallback, {"Crunk's finest moment. Welcome to the new subreddit!.mp4"}), + ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), + ('d0oir2', YtdlpFallback, {"Crunk's finest moment. Welcome to the new subreddit!.mp4"}), )) def test_name_submission( test_reddit_id: str, From 4a864827567bd59c2c473f6d2a08d4bbd46da1a3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 24 Nov 2021 11:07:52 +1000 Subject: [PATCH 133/150] Add skip statement for broken test on windows --- tests/test_file_name_formatter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index e60ae8d..30fac77 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -2,6 +2,7 @@ # coding=utf-8 import platform +import sys import unittest.mock from datetime import datetime from pathlib import Path @@ -213,6 +214,7 @@ def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str assert len(str(result)) <= FileNameFormatter.find_max_path_length() +@pytest.mark.skipif(sys.platform == 'win32', reason='Test broken on windows github') def test_shorten_filename_real(submission: MagicMock, tmp_path: Path): submission.title = 'A' * 500 submission.author.name = 'test' From dd8d74ee25a34cdfe36c271e6a26cc5f6e9ed7d8 Mon Sep 17 00:00:00 2001 From: "Jay R. 
Wren" Date: Sat, 30 Oct 2021 22:19:46 -0400 Subject: [PATCH 134/150] Add --ignore to ignore user --- README.md | 3 +++ bdfr/__main__.py | 1 + bdfr/archiver.py | 2 ++ bdfr/cloner.py | 2 ++ bdfr/configuration.py | 1 + 5 files changed, 9 insertions(+) diff --git a/README.md b/README.md index 7bc4f0b..3ffef7f 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,9 @@ The following options are common between both the `archive` and `download` comma - Can be specified multiple times - Disables certain modules from being used - See [Disabling Modules](#disabling-modules) for more information and a list of module names +- `--ignore` + - This will add a user to ignore + - Can be specified multiple times - `--include-id-file` - This will add any submission with the IDs in the files provided - Can be specified multiple times diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 367f8c6..de658de 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -17,6 +17,7 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--ignore-user', type=str, multiple=True, default=None), click.option('--include-id-file', multiple=True, default=None), click.option('--log', type=str, default=None), click.option('--saved', is_flag=True, default=None), diff --git a/bdfr/archiver.py b/bdfr/archiver.py index d445e8d..e51be57 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -28,6 +28,8 @@ class Archiver(RedditConnector): def download(self): for generator in self.reddit_lists: for submission in generator: + if submission.author.name in self.args.ignore_user: + continue logger.debug(f'Attempting to archive submission {submission.id}') self.write_entry(submission) diff --git a/bdfr/cloner.py b/bdfr/cloner.py index 979f50f..c48ae17 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -17,5 +17,7 @@ class RedditCloner(RedditDownloader, Archiver): def download(self): for generator in self.reddit_lists: for submission in generator: + if submission.author.name in self.args.ignore_user: + continue self._download_submission(submission) self.write_entry(submission) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index bc4c541..81fa3e4 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -18,6 +18,7 @@ class Configuration(Namespace): self.exclude_id_file = [] self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}' self.folder_scheme: str = '{SUBREDDIT}' + self.ignore_user = [] self.include_id_file = [] self.limit: Optional[int] = None self.link: list[str] = [] From 2b50ee072400226e15ee2526985214ce938a6670 Mon Sep 17 00:00:00 2001 From: "Jay R. Wren" Date: Mon, 1 Nov 2021 09:28:46 -0400 Subject: [PATCH 135/150] add test. fix typos. 
--- README.md | 2 +- bdfr/cloner.py | 2 -- bdfr/downloader.py | 4 ++++ tests/test_downloader.py | 17 +++++++++++++++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3ffef7f..b84aa3d 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ The following options are common between both the `archive` and `download` comma - Can be specified multiple times - Disables certain modules from being used - See [Disabling Modules](#disabling-modules) for more information and a list of module names -- `--ignore` +- `--ignore-user` - This will add a user to ignore - Can be specified multiple times - `--include-id-file` diff --git a/bdfr/cloner.py b/bdfr/cloner.py index c48ae17..979f50f 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -17,7 +17,5 @@ class RedditCloner(RedditDownloader, Archiver): def download(self): for generator in self.reddit_lists: for submission in generator: - if submission.author.name in self.args.ignore_user: - continue self._download_submission(submission) self.write_entry(submission) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 70052b2..edfd68e 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -51,6 +51,10 @@ class RedditDownloader(RedditConnector): elif submission.subreddit.display_name.lower() in self.args.skip_subreddit: logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list') return + elif submission.author.name in self.args.ignore_user: + logger.debug( + f'Submission {submission.id} in {submission.subreddit.display_name} by {submission.author.name} an ignored user') + return elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') return diff --git a/tests/test_downloader.py b/tests/test_downloader.py index e5f0a31..0cc8dec 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -200,3 +200,20 @@ def test_download_submission( RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) assert len(folder_contents) == expected_files_len + + +@pytest.mark.parametrize('test_ignore_user', ( + 'alice', +)) +def test_download_ignores_user( + test_ignore_user: str, + mock_function: MagicMock, + downloader_mock: MagicMock, +): + downloader_mock.args.ignore_user = test_ignore_user + submission = downloader_mock.reddit_instance.submission(id='m1hqw6') + mock_function.return_value = MagicMock() + mock_function.return_value.__name__ = 'test' + submission.author.name = test_ignore_user + RedditDownloader._download_submission(downloader_mock, submission) + assert mock_function.call_count == 0 From 0eeb4b46dc70fe30f4e2c865ee6578245e5a30f9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 24 Nov 2021 10:48:06 +1000 Subject: [PATCH 136/150] Remove bad test --- tests/test_downloader.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 0cc8dec..e5f0a31 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -200,20 +200,3 @@ def test_download_submission( RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) assert len(folder_contents) == expected_files_len - - -@pytest.mark.parametrize('test_ignore_user', ( - 'alice', -)) -def test_download_ignores_user( - test_ignore_user: str, - mock_function: MagicMock, - downloader_mock: MagicMock, -): - downloader_mock.args.ignore_user = test_ignore_user - submission = 
downloader_mock.reddit_instance.submission(id='m1hqw6') - mock_function.return_value = MagicMock() - mock_function.return_value.__name__ = 'test' - submission.author.name = test_ignore_user - RedditDownloader._download_submission(downloader_mock, submission) - assert mock_function.call_count == 0 From d0d72c82299be3d3a88a6eca950469ced4fce44b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 24 Nov 2021 10:54:29 +1000 Subject: [PATCH 137/150] Add integration test for downloader option --- bdfr/downloader.py | 3 ++- .../test_download_integration.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index edfd68e..028430f 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -53,7 +53,8 @@ class RedditDownloader(RedditConnector): return elif submission.author.name in self.args.ignore_user: logger.debug( - f'Submission {submission.id} in {submission.subreddit.display_name} by {submission.author.name} an ignored user') + f'Submission {submission.id} in {submission.subreddit.display_name} skipped' + f' due to {submission.author.name} being an ignored user') return elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index c2414ba..bd53382 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -337,3 +337,18 @@ def test_cli_download_include_id_file(tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Downloaded submission' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--ignore-user', 'ArjanEgges', '-l', 'm3hxzd'], +)) +def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission' not in result.output + assert 'being an ignored user' in result.output From f670b347ae94c2366da6fdab1f2a4e34eeb82249 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 24 Nov 2021 10:58:18 +1000 Subject: [PATCH 138/150] Add integration test for archiver option --- bdfr/archiver.py | 3 +++ .../integration_tests/test_archive_integration.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index e51be57..a2e54e5 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -29,6 +29,9 @@ class Archiver(RedditConnector): for generator in self.reddit_lists: for submission in generator: if submission.author.name in self.args.ignore_user: + logger.debug( + f'Submission {submission.id} in {submission.subreddit.display_name} skipped' + f' due to {submission.author.name} being an ignored user') continue logger.debug(f'Attempting to archive submission {submission.id}') self.write_entry(submission) diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 8cbb2d5..5ef04a6 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -106,3 +106,18 @@ def test_cli_archive_long(test_args: list[str], tmp_path: 
Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert re.search(r'Writing entry .*? to file in .*? format', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--ignore-user', 'ArjanEgges', '-l', 'm3hxzd'], +)) +def test_cli_archive_ignore_user(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'being an ignored user' in result.output + assert 'Attempting to archive submission' not in result.output From cc80acd6b5ad67c885d7faaa1656b3bc7abc421e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 24 Nov 2021 13:06:07 +1000 Subject: [PATCH 139/150] Increase version number --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 94ae1de..e5e244b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.4.2 +version = 2.5.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From 9ccc9e68633420c005518c3225bda27811d7d9ab Mon Sep 17 00:00:00 2001 From: dbanon87 <56310477+dbanon87@users.noreply.github.com> Date: Mon, 29 Nov 2021 09:22:21 -0500 Subject: [PATCH 140/150] Update archiver.py --- bdfr/archiver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index a2e54e5..559dcc1 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -28,10 +28,11 @@ class Archiver(RedditConnector): def download(self): for generator in self.reddit_lists: for submission in generator: - if submission.author.name in self.args.ignore_user: + if (submission.author and submission.author.name in self.args.ignore_user) or \ + (submission.author is None and 'DELETED' in self.args.ignore_user): logger.debug( f'Submission {submission.id} in {submission.subreddit.display_name} skipped' - f' due to {submission.author.name} being an ignored user') + f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') continue logger.debug(f'Attempting to archive submission {submission.id}') self.write_entry(submission) From 1530456cf7b6351b00a1c742d36d78475b6a3c7e Mon Sep 17 00:00:00 2001 From: dbanon87 <56310477+dbanon87@users.noreply.github.com> Date: Mon, 29 Nov 2021 09:23:04 -0500 Subject: [PATCH 141/150] Update downloader.py --- bdfr/downloader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 028430f..02f5c68 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -51,10 +51,11 @@ class RedditDownloader(RedditConnector): elif submission.subreddit.display_name.lower() in self.args.skip_subreddit: logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list') return - elif submission.author.name in self.args.ignore_user: + elif (submission.author and submission.author.name in self.args.ignore_user) or \ + (submission.author is None and 'DELETED' in self.args.ignore_user): logger.debug( f'Submission {submission.id} in {submission.subreddit.display_name} skipped' - f' due to {submission.author.name} being an ignored user') 
+ f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') From 92dca3bd0ea25b2bfd680998d357ef0cc8002589 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 30 Nov 2021 17:46:10 +1000 Subject: [PATCH 142/150] Increment version number --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index e5e244b..c476c0e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.5.0 +version = 2.5.1 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From 5288b79d1bbd70b0dd29468ec8d5a63afeddf46b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Dec 2021 13:04:11 +1000 Subject: [PATCH 143/150] Add test for time checking --- tests/test_connector.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index 35d0b2e..9fe58f2 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 - +from datetime import datetime, timedelta from pathlib import Path from typing import Iterator from unittest.mock import MagicMock @@ -195,6 +195,39 @@ def test_get_subreddit_normal( assert not any([isinstance(m, MagicMock) for m in results]) +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_time', 'test_delta'), ( + ('hour', timedelta(hours=1)), + ('day', timedelta(days=1)), + ('week', timedelta(days=7)), + ('month', timedelta(days=31)), + ('year', timedelta(days=365)), +)) +def test_get_subreddit_time_verification( + test_time: str, + test_delta: timedelta, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, +): + downloader_mock.args.limit = 10 + downloader_mock.args.sort = 'top' + downloader_mock.args.time = test_time + downloader_mock.time_filter = RedditConnector.create_time_filter(downloader_mock) + downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock) + downloader_mock.determine_sort_function.return_value = RedditConnector.determine_sort_function(downloader_mock) + downloader_mock.args.subreddit = ['all'] + downloader_mock.reddit_instance = reddit_instance + results = RedditConnector.get_subreddits(downloader_mock) + results = [sub for res1 in results for sub in res1] + assert all([isinstance(res1, praw.models.Submission) for res1 in results]) + nowtime = datetime.now() + for r in results: + result_time = datetime.fromtimestamp(r.created_utc) + time_diff = nowtime - result_time + assert time_diff < test_delta + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), ( From 36ff95de6b8eee13c901ce37b545e826fe36a12e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 19 Dec 2021 13:44:24 +1000 Subject: [PATCH 144/150] Add Patreon image support --- bdfr/site_downloaders/download_factory.py | 2 ++ tests/site_downloaders/test_download_factory.py | 1 + tests/site_downloaders/test_gallery.py | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 91489a0..49dba5f 100644 --- 
a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -33,6 +33,8 @@ class DownloadFactory: return Erome elif re.match(r'reddit\.com/gallery/.*', sanitised_url): return Gallery + elif re.match(r'patreon\.com.*', sanitised_url): + return Gallery elif re.match(r'gfycat\.', sanitised_url): return Gfycat elif re.match(r'(m\.)?imgur.*', sanitised_url): diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 441b554..134396c 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -47,6 +47,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://vimeo.com/channels/31259/53576664', YtdlpFallback), ('http://video.pbs.org/viralplayer/2365173446/', YtdlpFallback), ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub), + ('https://www.patreon.com/posts/minecart-track-59346560', Gallery), )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 08eea91..e9c401f 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -57,6 +57,14 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]): '65163f685fb28c5b776e0e77122718be', '2a337eb5b13c34d3ca3f51b5db7c13e9', }), + ('rb3ub6', { # patreon post + '748a976c6cedf7ea85b6f90e7cb685c7', + '839796d7745e88ced6355504e1f74508', + 'bcdb740367d0f19f97a77e614b48a42d', + '0f230b8c4e5d103d35a773fab9814ec3', + 'e5192d6cb4f84c4f4a658355310bf0f9', + '91cbe172cd8ccbcf049fcea4204eb979', + }) )) def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From af0a545c163c93090f56a3f1ebafdaef7b1f4c45 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 20 Dec 2021 20:43:09 +1000 Subject: [PATCH 145/150] Catch additional error in Gallery --- bdfr/site_downloaders/gallery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 158e338..eeb9e0f 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -24,7 +24,7 @@ class Gallery(BaseDownloader): except (AttributeError, TypeError): try: image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) - except (AttributeError, IndexError, TypeError): + except (AttributeError, IndexError, TypeError, KeyError): logger.error(f'Could not find gallery data in submission {self.post.id}') logger.exception('Gallery image find failure') raise SiteDownloaderError('No images found in Reddit gallery') From e564870cd670b2b9de85542c5a0fde2ccd48f5d1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 20 Dec 2021 20:50:47 +1000 Subject: [PATCH 146/150] Increase version number --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index c476c0e..198ebe7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.5.1 +version = 2.5.2 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer 
= Serene Arc From 850faffc29706efc36703363f9baba88b649e331 Mon Sep 17 00:00:00 2001 From: Thayol Date: Wed, 5 Jan 2022 01:17:59 +0100 Subject: [PATCH 147/150] Add PowerShell scripts --- .gitattributes | 2 ++ scripts/extract_failed_ids.ps1 | 21 +++++++++++++++++++++ scripts/extract_successful_ids.ps1 | 21 +++++++++++++++++++++ scripts/print_summary.ps1 | 30 ++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+) create mode 100644 .gitattributes create mode 100644 scripts/extract_failed_ids.ps1 create mode 100644 scripts/extract_successful_ids.ps1 create mode 100644 scripts/print_summary.ps1 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c16e947 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Declare files that will always have CRLF line endings on checkout. +*.ps1 text eol=crlf \ No newline at end of file diff --git a/scripts/extract_failed_ids.ps1 b/scripts/extract_failed_ids.ps1 new file mode 100644 index 0000000..17d96f6 --- /dev/null +++ b/scripts/extract_failed_ids.ps1 @@ -0,0 +1,21 @@ +if (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] +} +else { + Write-Host "CANNOT FIND LOG FILE" + Exit 1 +} + +if ($args[1] -ne $null) { + $output=$args[1] + Write-Host "Outputting IDs to $output" +} +else { + $output="./failed.txt" +} + +Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output +Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 12 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } >> $output diff --git a/scripts/extract_successful_ids.ps1 b/scripts/extract_successful_ids.ps1 new file mode 100644 index 0000000..3dbb315 --- /dev/null +++ b/scripts/extract_successful_ids.ps1 @@ -0,0 +1,21 @@ +if (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] +} +else { + Write-Host "CANNOT FIND LOG FILE" + Exit 1 +} + +if ($args[1] -ne $null) { + $output=$args[1] + Write-Host "Outputting IDs to $output" +} +else { + $output="./successful.txt" +} + +Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output +Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } >> $output diff --git a/scripts/print_summary.ps1 b/scripts/print_summary.ps1 new file mode 100644 index 
0000000..5d85b09 --- /dev/null +++ b/scripts/print_summary.ps1 @@ -0,0 +1,30 @@ +if (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] +} +else { + Write-Host "CANNOT FIND LOG FILE" + Exit 1 +} + +if ($args[1] -ne $null) { + $output=$args[1] + Write-Host "Outputting IDs to $output" +} +else { + $output="./successful.txt" +} + +Write-Host -NoNewline "Downloaded submissions: " +Write-Host (Select-String -Path $file -Pattern "Downloaded submission" -AllMatches).Matches.Count +Write-Host -NoNewline "Failed downloads: " +Write-Host (Select-String -Path $file -Pattern "failed to download submission" -AllMatches).Matches.Count +Write-Host -NoNewline "Files already downloaded: " +Write-Host (Select-String -Path $file -Pattern "already exists, continuing" -AllMatches).Matches.Count +Write-Host -NoNewline "Hard linked submissions: " +Write-Host (Select-String -Path $file -Pattern "Hard link made" -AllMatches).Matches.Count +Write-Host -NoNewline "Excluded submissions: " +Write-Host (Select-String -Path $file -Pattern "in exclusion list" -AllMatches).Matches.Count +Write-Host -NoNewline "Files with existing hash skipped: " +Write-Host (Select-String -Path $file -Pattern "downloaded elsewhere" -AllMatches).Matches.Count +Write-Host -NoNewline "Submissions from excluded subreddits: " +Write-Host (Select-String -Path $file -Pattern "in skip list" -AllMatches).Matches.Count From ac3a8e913df84019b0d6dcd7403d5f9a4e946832 Mon Sep 17 00:00:00 2001 From: Thayol Date: Wed, 5 Jan 2022 13:13:45 +0100 Subject: [PATCH 148/150] Fix wrong offset --- scripts/extract_successful_ids.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_successful_ids.ps1 b/scripts/extract_successful_ids.ps1 index 3dbb315..00722f1 100644 --- a/scripts/extract_successful_ids.ps1 +++ b/scripts/extract_successful_ids.ps1 @@ -16,6 +16,6 @@ else { Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output Select-String -Path $file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output -Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } >> $output From 8ec45a9302dbf420dbfaed382e3d0758be3fd71c Mon Sep 17 00:00:00 2001 From: Thayol Date: Thu, 6 Jan 2022 04:06:46 +0100 Subject: [PATCH 149/150] Fix Bash script: Failed to write --- scripts/extract_failed_ids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index f96bd9a..8addf7e 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -18,6 +18,6 @@ fi grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; - grep 'Failed to write file' "$file" | awk '{ print $13 }' | rev | 
cut -c 2- | rev ; + grep 'Failed to write file' "$file" | awk '{ print $14 }' ; grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; } >>"$output" From 3811ec37fb121675a3d5c3007ab96c9c44794144 Mon Sep 17 00:00:00 2001 From: Thayol Date: Thu, 6 Jan 2022 12:16:44 +0100 Subject: [PATCH 150/150] Fix offset and remove substring --- scripts/extract_failed_ids.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_failed_ids.ps1 b/scripts/extract_failed_ids.ps1 index 17d96f6..be2d2cb 100644 --- a/scripts/extract_failed_ids.ps1 +++ b/scripts/extract_failed_ids.ps1 @@ -17,5 +17,5 @@ else { Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output -Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 12 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } >> $output Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } >> $output
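
As a closing usage sketch for the log-parsing scripts above: the IDs that one run failed to download can be fed back into the BDFR for a retry with the same `xargs` pattern described in the README's Common Command Tricks section. The log path and output directory below are placeholders, the shell script is assumed to take an optional output file as its second argument (as the PowerShell versions above do), and `-l`/`--link` is taken to accept bare submission IDs, as the integration tests earlier in this series use:

```bash
# Collect the IDs of submissions that failed in the previous run
./scripts/extract_failed_ids.sh ./log_output.txt ./failed.txt

# Retry just those submissions, batching 50 IDs per invocation
cat ./failed.txt | xargs -L 1 echo --link | xargs -L 50 python3 -m bdfr download ./downloads
```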