Merge pull request #489 from aliparlakci/development

Update to v2.3.0
Authored by Serene on 2021-07-17 21:23:26 +10:00; committed by GitHub.
44 changed files with 491 additions and 181 deletions

.gitmodules vendored Normal file
View File

@@ -0,0 +1,9 @@
[submodule "scripts/tests/bats"]
path = scripts/tests/bats
url = https://github.com/bats-core/bats-core.git
[submodule "scripts/tests/test_helper/bats-assert"]
path = scripts/tests/test_helper/bats-assert
url = https://github.com/bats-core/bats-assert.git
[submodule "scripts/tests/test_helper/bats-support"]
path = scripts/tests/test_helper/bats-support
url = https://github.com/bats-core/bats-support.git

View File

@@ -196,6 +196,9 @@ The following options are for the `archive` command specifically.
   - `json` (default)
   - `xml`
   - `yaml`
+- `--comment-context`
+  - Instead of downloading an individual comment, this option will download the submission that the comment is a part of
+  - May result in a longer run time, as it retrieves much more data

 ### Cloner Options
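
For orientation, the new flag combines with the existing archive options like so; this is an illustrative invocation, not part of the diff, and the comment ID is a placeholder:

```bash
# Archive one comment, but pull in the whole submission it belongs to
# (--comment-context), writing YAML instead of the default JSON.
python3 -m bdfr archive ./output --link <comment-id> --comment-context -f yaml
```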

View File

@@ -50,6 +50,7 @@ _downloader_options = [
 _archiver_options = [
     click.option('--all-comments', is_flag=True, default=None),
+    click.option('--comment-context', is_flag=True, default=None),
     click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
 ]

View File

@@ -22,6 +22,7 @@ class BaseArchiveEntry(ABC):
             'id': in_comment.id,
             'score': in_comment.score,
             'subreddit': in_comment.subreddit.display_name,
+            'author_flair': in_comment.author_flair_text,
             'submission': in_comment.submission.id,
             'stickied': in_comment.stickied,
             'body': in_comment.body,

View File

@@ -61,6 +61,9 @@ class Archiver(RedditConnector):
             raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')

     def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
+        if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
+            logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}')
+            praw_item = praw_item.submission
         archive_entry = self._pull_lever_entry_factory(praw_item)
         if self.args.format == 'json':
             self._write_entry_json(archive_entry)

View File

@@ -41,8 +41,9 @@ class Configuration(Namespace):
         self.verbose: int = 0

         # Archiver-specific options
-        self.format = 'json'
         self.all_comments = False
+        self.format = 'json'
+        self.comment_context: bool = False

     def process_click_arguments(self, context: click.Context):
         for arg_key in context.params.keys():

View File

@@ -90,14 +90,11 @@ class RedditConnector(metaclass=ABCMeta):
     def read_config(self):
         """Read any cfg values that need to be processed"""
         if self.args.max_wait_time is None:
-            if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'):
-                self.cfg_parser.set('DEFAULT', 'max_wait_time', '120')
-                logger.log(9, 'Wrote default download wait time download to config file')
-            self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time')
+            self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120)
             logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds')
         if self.args.time_format is None:
             option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO')
-            if re.match(r'^[ \'\"]*$', option):
+            if re.match(r'^[\s\'\"]*$', option):
                 option = 'ISO'
             logger.debug(f'Setting datetime format string to {option}')
             self.args.time_format = option
@@ -119,7 +116,7 @@ class RedditConnector(metaclass=ABCMeta):
             logger.debug('Using authenticated Reddit instance')
             if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
                 logger.log(9, 'Commencing OAuth2 authentication')
-                scopes = self.cfg_parser.get('DEFAULT', 'scopes')
+                scopes = self.cfg_parser.get('DEFAULT', 'scopes', fallback='identity, history, read, save')
                 scopes = OAuth2Authenticator.split_scopes(scopes)
                 oauth2_authenticator = OAuth2Authenticator(
                     scopes,
@@ -210,7 +207,7 @@ class RedditConnector(metaclass=ABCMeta):
         if log_path.exists():
             try:
                 file_handler.doRollover()
-            except PermissionError as e:
+            except PermissionError:
                 logger.critical(
                     'Cannot rollover logfile, make sure this is the only '
                     'BDFR process or specify alternate logfile location')
@@ -242,6 +239,9 @@ class RedditConnector(metaclass=ABCMeta):
         if self.args.subreddit:
             out = []
             for reddit in self.split_args_input(self.args.subreddit):
+                if reddit == 'friends' and self.authenticated is False:
+                    logger.error('Cannot read friends subreddit without an authenticated instance')
+                    continue
                 try:
                     reddit = self.reddit_instance.subreddit(reddit)
                     try:
@@ -394,7 +394,7 @@ class RedditConnector(metaclass=ABCMeta):
     @staticmethod
     def check_subreddit_status(subreddit: praw.models.Subreddit):
-        if subreddit.display_name == 'all':
+        if subreddit.display_name in ('all', 'friends'):
             return
         try:
             assert subreddit.id
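
The two `fallback=` values above mean a fresh config no longer needs these keys written back to it; a sketch of the equivalent explicit settings (hypothetical file name, values taken from the fallbacks in the diff):

```bash
# Write a config whose explicit values match the new fallback defaults.
cat > bdfr_config.cfg <<'EOF'
[DEFAULT]
max_wait_time = 120
scopes = identity, history, read, save
EOF
```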

View File

@@ -54,6 +54,9 @@ class RedditDownloader(RedditConnector):
         elif not isinstance(submission, praw.models.Submission):
             logger.warning(f'{submission.id} is not a submission')
             return
+        elif not self.download_filter.check_url(submission.url):
+            logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
+            return
         logger.debug(f'Attempting to download submission {submission.id}')
         try:
@@ -76,7 +79,7 @@ class RedditDownloader(RedditConnector):
                 logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
                 continue
             elif not self.download_filter.check_resource(res):
-                logger.debug(f'Download filter removed {submission.id} with URL {submission.url}')
+                logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                 continue
             try:
                 res.download(self.args.max_wait_time)
@@ -103,7 +106,8 @@ class RedditDownloader(RedditConnector):
                 logger.debug(f'Written file to {destination}')
         except OSError as e:
             logger.exception(e)
-            logger.error(f'Failed to write file to {destination} in submission {submission.id}: {e}')
+            logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
+            return
         creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
         os.utime(destination, (creation_time, creation_time))
         self.master_hash_list[resource_hash] = destination

View File

@@ -13,6 +13,7 @@ from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import Youtub
 from bdfr.site_downloaders.gallery import Gallery
 from bdfr.site_downloaders.gfycat import Gfycat
 from bdfr.site_downloaders.imgur import Imgur
+from bdfr.site_downloaders.pornhub import PornHub
 from bdfr.site_downloaders.redgifs import Redgifs
 from bdfr.site_downloaders.self_post import SelfPost
 from bdfr.site_downloaders.youtube import Youtube
@@ -43,6 +44,8 @@ class DownloadFactory:
             return Youtube
         elif re.match(r'i\.redd\.it.*', sanitised_url):
             return Direct
+        elif re.match(r'pornhub\.com.*', sanitised_url):
+            return PornHub
         elif YoutubeDlFallback.can_handle_link(sanitised_url):
             return YoutubeDlFallback
         else:

View File

@@ -1,10 +1,9 @@
 #!/usr/bin/env python3

 import logging
-import re
+from typing import Optional

-import bs4
 import requests
 from praw.models import Submission

 from bdfr.exceptions import SiteDownloaderError
@@ -20,21 +19,30 @@ class Gallery(BaseDownloader):
         super().__init__(post)

     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        image_urls = self._get_links(self.post.url)
+        try:
+            image_urls = self._get_links(self.post.gallery_data['items'])
+        except AttributeError:
+            try:
+                image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items'])
+            except (AttributeError, IndexError, TypeError):
+                logger.error(f'Could not find gallery data in submission {self.post.id}')
+                logger.exception('Gallery image find failure')
+                raise SiteDownloaderError('No images found in Reddit gallery')
+
         if not image_urls:
             raise SiteDownloaderError('No images found in Reddit gallery')
         return [Resource(self.post, url) for url in image_urls]

-    @staticmethod
-    def _get_links(url: str) -> list[str]:
-        resource_headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
-                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-        }
-        page = Gallery.retrieve_url(url, headers=resource_headers)
-        soup = bs4.BeautifulSoup(page.text, 'html.parser')
-        links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
-        links = [link.get('href') for link in links]
-        return links
+    @ staticmethod
+    def _get_links(id_dict: list[dict]) -> list[str]:
+        out = []
+        for item in id_dict:
+            image_id = item['media_id']
+            possible_extensions = ('.jpg', '.png', '.gif', '.gifv', '.jpeg')
+            for extension in possible_extensions:
+                test_url = f'https://i.redd.it/{image_id}{extension}'
+                response = requests.head(test_url)
+                if response.status_code == 200:
+                    out.append(test_url)
+                    break
+        return out
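
The rewritten `_get_links` trades HTML scraping for extension probing against i.redd.it. The same probe can be reproduced from a shell; a rough equivalent for a single gallery item (media ID taken from the test data added later in this diff):

```bash
# Issue HEAD requests until one extension returns HTTP 200, mirroring
# the requests.head() loop in the new _get_links above.
media_id='18nzv9ch0hn61'
for ext in jpg png gif gifv jpeg; do
    code=$(curl -s -o /dev/null -w '%{http_code}' -I "https://i.redd.it/${media_id}.${ext}")
    if [ "$code" = "200" ]; then
        echo "https://i.redd.it/${media_id}.${ext}"
        break
    fi
done
```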

View File

@@ -37,9 +37,10 @@ class Imgur(BaseDownloader):

     @staticmethod
     def _get_data(link: str) -> dict:
-        if re.match(r'.*\.gifv$', link):
+        link = link.rstrip('?')
+        if re.match(r'(?i).*\.gifv$', link):
             link = link.replace('i.imgur', 'imgur')
-            link = link.rstrip('.gifv')
+            link = re.sub('(?i)\\.gifv$', '', link)

         res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# coding=utf-8

import logging
from typing import Optional

from praw.models import Submission

from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.youtube import Youtube

logger = logging.getLogger(__name__)


class PornHub(Youtube):
    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        ytdl_options = {
            'format': 'best',
            'nooverwrites': True,
        }
        out = self._download_video(ytdl_options)
        return [out]

View File

@@ -4,7 +4,6 @@ import json
 import re
 from typing import Optional

-from bs4 import BeautifulSoup
 from praw.models import Submission

 from bdfr.exceptions import SiteDownloaderError

View File

@@ -43,7 +43,6 @@ class Youtube(BaseDownloader):
             except youtube_dl.DownloadError as e:
                 raise SiteDownloaderError(f'Youtube download failed: {e}')

-        downloaded_file = None
         downloaded_files = list(download_path.iterdir())
         if len(downloaded_files) > 0:
             downloaded_file = downloaded_files[0]

View File

@@ -11,12 +11,13 @@ if [ -n "$2" ]; then
output="$2"
echo "Outputting IDs to $output"
else
output="failed.txt"
output="./failed.txt"
fi
{
grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ;
grep 'Failed to download resource' "$file" | awk '{ print $15 }' ;
grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ;
grep 'Failed to write file' "$file" | awk '{ print $16 }' | rev | cut -c 2- | rev ;
grep 'Failed to write file' "$file" | awk '{ print $13 }' | rev | cut -c 2- | rev ;
grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ;
} >>"$output"

View File

@@ -11,7 +11,7 @@ if [ -n "$2" ]; then
output="$2"
echo "Outputting IDs to $output"
else
output="successful.txt"
output="./successful.txt"
fi
{

scripts/tests/README.md Normal file
View File

@@ -0,0 +1,13 @@
# Bash Scripts Testing

The `bats` framework is included and used to test the scripts in this directory, specifically the ones designed to parse the logging output. As this involves delicate regexes and field indexes, it is necessary to test them.

## Running Tests

Running the tests is easy and can be done with a single command. With this directory as the working directory, run the following command.

```bash
./bats/bin/bats *.bats
```

This will run all test files that have the `.bats` suffix.
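
A minimal test in this framework follows the same shape as the suites added later in this diff; a hedged sketch (hypothetical test name, real assertions live in the `.bats` files below):

```bash
#!/usr/bin/env bats

setup() {
    load ./test_helper/bats-support/load
    load ./test_helper/bats-assert/load
}

@test "script fails when no logfile is given" {
    run ../extract_failed_ids.sh
    assert_failure
}
```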

scripts/tests/bats Submodule

Submodule scripts/tests/bats added at ce5ca2802f

View File

@@ -0,0 +1 @@
[2021-06-12 12:49:18,452 - bdfr.downloader - DEBUG] - Submission m2601g skipped due to disabled module Direct

View File

@@ -0,0 +1,3 @@
[2021-06-12 11:13:35,665 - bdfr.downloader - ERROR] - Could not download submission nxv3ew: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.447961v1?rss=1
[2021-06-12 11:14:21,958 - bdfr.downloader - ERROR] - Could not download submission nxv3ek: No downloader module exists for url https://alkossegyedit.hu/termek/pluss-macko-poloval-20cm/?feed_id=34832&_unique_id=60c40a1190ccb&utm_source=Reddit&utm_medium=AEAdmin&utm_campaign=Poster
[2021-06-12 11:17:53,456 - bdfr.downloader - ERROR] - Could not download submission nxv3ea: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.448067v1?rss=1

View File

@@ -0,0 +1,2 @@
[2021-06-12 11:18:25,794 - bdfr.downloader - ERROR] - Failed to download resource https://i.redd.it/61fniokpjq471.jpg in submission nxv3dt with downloader Direct: Unrecoverable error requesting resource: HTTP Code 404

View File

@@ -0,0 +1,2 @@
[2021-06-12 08:38:35,657 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxr7x9: No images found in Reddit gallery
[2021-06-12 08:47:22,005 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxpn0h: Server responded with 503 to https://www.reddit.com/gallery/nxpkvh

View File

@@ -0,0 +1 @@
[2021-06-09 22:01:04,530 - bdfr.downloader - ERROR] - Failed to write file in submission nnboza to C:\Users\Yoga 14\path\to\output\ThotNetwork\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4: [Errno 2] No such file or directory: 'C:\\Users\\Yoga 14\\path\\to\\output\\ThotNetwork\\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4'

View File

@@ -0,0 +1,3 @@
[2021-06-12 08:41:51,464 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxry0l.jpg from submission nxry0l already exists, continuing
[2021-06-12 08:41:51,469 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrlgn.gif from submission nxrlgn already exists, continuing
[2021-06-12 08:41:51,472 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrq9g.png from submission nxrq9g already exists, continuing

View File

@@ -0,0 +1,3 @@
[2021-06-10 20:36:48,722 - bdfr.downloader - DEBUG] - Download filter removed nwfirr with URL https://www.youtube.com/watch?v=NVSiX0Tsees
[2021-06-12 19:56:36,848 - bdfr.downloader - DEBUG] - Download filter removed nwfgcl with URL https://www.reddit.com/r/MaliciousCompliance/comments/nwfgcl/new_guy_decided_to_play_manager_alright/
[2021-06-12 19:56:28,587 - bdfr.downloader - DEBUG] - Download filter removed nxuxjy with URL https://www.reddit.com/r/MaliciousCompliance/comments/nxuxjy/you_want_an_omelette_with_nothing_inside_okay/

View File

@@ -0,0 +1,7 @@
[2021-06-12 11:58:53,864 - bdfr.downloader - INFO] - Downloaded submission nxui9y from tumblr
[2021-06-12 11:58:56,618 - bdfr.downloader - INFO] - Downloaded submission nxsr4r from tumblr
[2021-06-12 11:58:59,026 - bdfr.downloader - INFO] - Downloaded submission nxviir from tumblr
[2021-06-12 11:59:00,289 - bdfr.downloader - INFO] - Downloaded submission nxusva from tumblr
[2021-06-12 11:59:00,735 - bdfr.downloader - INFO] - Downloaded submission nxvko7 from tumblr
[2021-06-12 11:59:01,215 - bdfr.downloader - INFO] - Downloaded submission nxvd63 from tumblr
[2021-06-12 11:59:13,891 - bdfr.downloader - INFO] - Downloaded submission nn9cor from tumblr

View File

@@ -0,0 +1 @@
[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Hard link made linking /media/smaug/private/reddit/tumblr/nwnp2n.jpg to /media/smaug/private/reddit/tumblr/nwskqb.jpg in submission nwnp2n

View File

@@ -0,0 +1 @@
[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Resource hash aaaaaaaaaaaaaaaaaaaaaaa from submission n86jk8 downloaded elsewhere

View File

@@ -0,0 +1,43 @@
setup() {
    load ./test_helper/bats-support/load
    load ./test_helper/bats-assert/load
}

teardown() {
    rm -f failed.txt
}

@test "fail run no logfile" {
    run ../extract_failed_ids.sh
    assert_failure
}

@test "fail no downloader module" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail resource error" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail site downloader error" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail failed file write" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail disabled module" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

View File

@@ -0,0 +1,38 @@
setup() {
    load ./test_helper/bats-support/load
    load ./test_helper/bats-assert/load
}

teardown() {
    rm -f successful.txt
}

@test "success downloaded submission" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success resource hash" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success download filter" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success already exists" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success hard link" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

View File

@@ -0,0 +1,2 @@
#!/usr/bin/env python3
# coding=utf-8

View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# coding=utf-8

import re
import shutil
from pathlib import Path

import pytest
from click.testing import CliRunner

from bdfr.__main__ import cli

does_test_config_exist = Path('../test_config.cfg').exists()


def copy_test_config(run_path: Path):
    shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg'))


def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path):
    copy_test_config(run_path)
    out = [
        'archive',
        str(run_path),
        '-v',
        '--config', str(Path(run_path, '../test_config.cfg')),
        '--log', str(Path(run_path, 'test_log.txt')),
    ] + test_args
    return out


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['-l', 'gstd4hk'],
    ['-l', 'm2601g', '-f', 'yaml'],
    ['-l', 'n60t4c', '-f', 'xml'],
))
def test_cli_archive_single(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert re.search(r'Writing entry .*? to file in .*? format', result.output)


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['--subreddit', 'Mindustry', '-L', 25],
    ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'],
    ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'],
    ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'],
    ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'],
    ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'],
))
def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert re.search(r'Writing entry .*? to file in .*? format', result.output)


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'],
    ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'],
))
def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['--comment-context', '--link', 'gxqapql'],
))
def test_cli_archive_full_context(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert 'Converting comment' in result.output


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.slow
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['--subreddit', 'all', '-L', 100],
    ['--subreddit', 'all', '-L', 100, '--sort', 'new'],
))
def test_cli_archive_long(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert re.search(r'Writing entry .*? to file in .*? format', result.output)

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
# coding=utf-8

import shutil
from pathlib import Path

import pytest
from click.testing import CliRunner

from bdfr.__main__ import cli

does_test_config_exist = Path('../test_config.cfg').exists()


def copy_test_config(run_path: Path):
    shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg'))


def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path):
    out = [
        'clone',
        str(tmp_path),
        '-v',
        '--config', 'test_config.cfg',
        '--log', str(Path(tmp_path, 'test_log.txt')),
    ] + test_args
    return out


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['-l', 'm2601g'],
    ['-s', 'TrollXChromosomes/', '-L', 1],
))
def test_cli_scrape_general(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_cloner_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert 'Downloaded submission' in result.output
    assert 'Record for entry item' in result.output

View File

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # coding=utf-8
-import re
+import shutil
 from pathlib import Path

 import pytest
@@ -9,37 +9,20 @@ from click.testing import CliRunner
 from bdfr.__main__ import cli

-does_test_config_exist = Path('test_config.cfg').exists()
+does_test_config_exist = Path('../test_config.cfg').exists()


-def create_basic_args_for_download_runner(test_args: list[str], tmp_path: Path):
+def copy_test_config(run_path: Path):
+    shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg'))
+
+
+def create_basic_args_for_download_runner(test_args: list[str], run_path: Path):
+    copy_test_config(run_path)
     out = [
-        'download', str(tmp_path),
+        'download', str(run_path),
         '-v',
-        '--config', 'test_config.cfg',
-        '--log', str(Path(tmp_path, 'test_log.txt')),
-    ] + test_args
-    return out
-
-
-def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path):
-    out = [
-        'archive',
-        str(tmp_path),
-        '-v',
-        '--config', 'test_config.cfg',
-        '--log', str(Path(tmp_path, 'test_log.txt')),
-    ] + test_args
-    return out
-
-
-def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path):
-    out = [
-        'clone',
-        str(tmp_path),
-        '-v',
-        '--config', 'test_config.cfg',
-        '--log', str(Path(tmp_path, 'test_log.txt')),
+        '--config', str(Path(run_path, '../test_config.cfg')),
+        '--log', str(Path(run_path, 'test_log.txt')),
     ] + test_args
     return out
@@ -74,6 +57,21 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path):
     assert 'Added submissions from subreddit ' in result.output


+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.authenticated
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['--subreddit', 'friends', '-L', 10, '--authenticate'],
+))
+def test_cli_download_user_specific_subreddits(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = create_basic_args_for_download_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'Added submissions from subreddit ' in result.output
+
+
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@@ -163,7 +161,7 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
 @pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'python', '-L', 10, '--search-existing'],
+    ['--subreddit', 'python', '-L', 1, '--search-existing'],
 ))
 def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
     Path(tmp_path, 'test.txt').touch()
@@ -180,13 +178,14 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
 @pytest.mark.parametrize('test_args', (
     ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'],
     ['--subreddit', 'MaliciousCompliance', '-L', '25', '--skip', 'txt'],
+    ['--subreddit', 'tumblr', '-L', '10', '--skip-domain', 'i.redd.it'],
 ))
 def test_cli_download_download_filters(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
     test_args = create_basic_args_for_download_runner(test_args, tmp_path)
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
-    assert 'Download filter removed ' in result.output
+    assert any((string in result.output for string in ('Download filter removed ', 'filtered due to URL')))


 @pytest.mark.online
@@ -203,71 +202,6 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path):
     assert result.exit_code == 0


-@pytest.mark.online
-@pytest.mark.reddit
-@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
-@pytest.mark.parametrize('test_args', (
-    ['-l', 'gstd4hk'],
-    ['-l', 'm2601g', '-f', 'yaml'],
-    ['-l', 'n60t4c', '-f', 'xml'],
-))
-def test_cli_archive_single(test_args: list[str], tmp_path: Path):
-    runner = CliRunner()
-    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
-    result = runner.invoke(cli, test_args)
-    assert result.exit_code == 0
-    assert re.search(r'Writing entry .*? to file in .*? format', result.output)
-
-
-@pytest.mark.online
-@pytest.mark.reddit
-@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
-@pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'Mindustry', '-L', 25],
-    ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'],
-    ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'],
-    ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'],
-    ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'],
-    ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'],
-))
-def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path):
-    runner = CliRunner()
-    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
-    result = runner.invoke(cli, test_args)
-    assert result.exit_code == 0
-    assert re.search(r'Writing entry .*? to file in .*? format', result.output)
-
-
-@pytest.mark.online
-@pytest.mark.reddit
-@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
-@pytest.mark.parametrize('test_args', (
-    ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'],
-    ['--user', 'me', '--user', 'djnish', '--authenticate', '--all-comments', '-L', '10'],
-))
-def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path):
-    runner = CliRunner()
-    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
-    result = runner.invoke(cli, test_args)
-    assert result.exit_code == 0
-
-
-@pytest.mark.online
-@pytest.mark.reddit
-@pytest.mark.slow
-@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
-@pytest.mark.parametrize('test_args', (
-    ['--subreddit', 'all', '-L', 100],
-    ['--subreddit', 'all', '-L', 100, '--sort', 'new'],
-))
-def test_cli_archive_long(test_args: list[str], tmp_path: Path):
-    runner = CliRunner()
-    test_args = create_basic_args_for_archive_runner(test_args, tmp_path)
-    result = runner.invoke(cli, test_args)
-    assert result.exit_code == 0
-    assert re.search(r'Writing entry .*? to file in .*? format', result.output)
-
-
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.slow
@@ -279,6 +213,7 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path):
     ['--subreddit', 'submitters', '-L', 10],  # Private subreddit
     ['--subreddit', 'donaldtrump', '-L', 10],  # Banned subreddit
     ['--user', 'djnish', '--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10],
+    ['--subreddit', 'friends', '-L', 10],
 ))
 def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
@@ -371,19 +306,3 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path):
     assert result.exit_code == 0
     assert 'skipped due to disabled module' in result.output
     assert 'Downloaded submission' not in result.output
-
-
-@pytest.mark.online
-@pytest.mark.reddit
-@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
-@pytest.mark.parametrize('test_args', (
-    ['-l', 'm2601g'],
-    ['-s', 'TrollXChromosomes/', '-L', 1],
-))
-def test_cli_scrape_general(test_args: list[str], tmp_path: Path):
-    runner = CliRunner()
-    test_args = create_basic_args_for_cloner_runner(test_args, tmp_path)
-    result = runner.invoke(cli, test_args)
-    assert result.exit_code == 0
-    assert 'Downloaded submission' in result.output
-    assert 'Record for entry item' in result.output

View File

@@ -13,6 +13,7 @@ from bdfr.site_downloaders.fallback_downloaders.youtubedl_fallback import Youtub
 from bdfr.site_downloaders.gallery import Gallery
 from bdfr.site_downloaders.gfycat import Gfycat
 from bdfr.site_downloaders.imgur import Imgur
+from bdfr.site_downloaders.pornhub import PornHub
 from bdfr.site_downloaders.redgifs import Redgifs
 from bdfr.site_downloaders.self_post import SelfPost
 from bdfr.site_downloaders.youtube import Youtube
@@ -44,6 +45,7 @@ from bdfr.site_downloaders.youtube import Youtube
     ('https://streamable.com/dt46y', YoutubeDlFallback),
     ('https://vimeo.com/channels/31259/53576664', YoutubeDlFallback),
     ('http://video.pbs.org/viralplayer/2365173446/', YoutubeDlFallback),
+    ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub),
 ))
 def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit):
     result = DownloadFactory.pull_lever(test_submission_url)

View File

@@ -8,30 +8,32 @@ from bdfr.site_downloaders.gallery import Gallery
 @pytest.mark.online
-@pytest.mark.parametrize(('test_url', 'expected'), (
-    ('https://www.reddit.com/gallery/m6lvrh', {
-        'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8',
-        'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805',
-        'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76',
-        'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444'
+@pytest.mark.parametrize(('test_ids', 'expected'), (
+    ([
+        {'media_id': '18nzv9ch0hn61'},
+        {'media_id': 'jqkizcch0hn61'},
+        {'media_id': 'k0fnqzbh0hn61'},
+        {'media_id': 'm3gamzbh0hn61'},
+    ], {
+        'https://i.redd.it/18nzv9ch0hn61.jpg',
+        'https://i.redd.it/jqkizcch0hn61.jpg',
+        'https://i.redd.it/k0fnqzbh0hn61.jpg',
+        'https://i.redd.it/m3gamzbh0hn61.jpg'
     }),
-    ('https://www.reddit.com/gallery/ljyy27', {
-        'https://preview.redd.it/04vxj25uqih61.png?width=92&'
-        'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4',
-        'https://preview.redd.it/0fnx83kpqih61.png?width=241&'
-        'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a',
-        'https://preview.redd.it/7zkmr1wqqih61.png?width=237&'
-        'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0',
-        'https://preview.redd.it/u37k5gxrqih61.png?width=443&'
-        'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862'
+    ([
+        {'media_id': '04vxj25uqih61'},
+        {'media_id': '0fnx83kpqih61'},
+        {'media_id': '7zkmr1wqqih61'},
+        {'media_id': 'u37k5gxrqih61'},
+    ], {
+        'https://i.redd.it/04vxj25uqih61.png',
+        'https://i.redd.it/0fnx83kpqih61.png',
+        'https://i.redd.it/7zkmr1wqqih61.png',
+        'https://i.redd.it/u37k5gxrqih61.png'
     }),
 ))
-def test_gallery_get_links(test_url: str, expected: set[str]):
-    results = Gallery._get_links(test_url)
+def test_gallery_get_links(test_ids: list[dict], expected: set[str]):
+    results = Gallery._get_links(test_ids)
     assert set(results) == expected
@@ -39,16 +41,24 @@ def test_gallery_get_links(test_url: str, expected: set[str]):
 @pytest.mark.reddit
 @pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), (
     ('m6lvrh', {
-        '6c8a892ae8066cbe119218bcaac731e1',
-        '93ce177f8cb7994906795f4615114d13',
-        '9a293adf19354f14582608cf22124574',
-        'b73e2c3daee02f99404644ea02f1ae65'
+        '5c42b8341dd56eebef792e86f3981c6a',
+        '8f38d76da46f4057bf2773a778e725ca',
+        'f5776f8f90491c8b770b8e0a6bfa49b3',
+        'fa1a43c94da30026ad19a9813a0ed2c2',
     }),
     ('ljyy27', {
-        '1bc38bed88f9c4770e22a37122d5c941',
-        '2539a92b78f3968a069df2dffe2279f9',
-        '37dea50281c219b905e46edeefc1a18d',
-        'ec4924cf40549728dcf53dd40bc7a73c'
+        '359c203ec81d0bc00e675f1023673238',
+        '79262fd46bce5bfa550d878a3b898be4',
+        '808c35267f44acb523ce03bfa5687404',
+        'ec8b65bdb7f1279c4b3af0ea2bbb30c3',
     }),
+    ('nxyahw', {
+        'b89a3f41feb73ec1136ec4ffa7353eb1',
+        'cabb76fd6fd11ae6e115a2039eb09f04',
+    }),
+    ('obkflw', {
+        '65163f685fb28c5b776e0e77122718be',
+        '2a337eb5b13c34d3ca3f51b5db7c13e9',
+    }),
 ))
 def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit):

View File

@@ -130,6 +130,18 @@ def test_imgur_extension_validation_bad(test_extension: str):
             'fb6c913d721c0bbb96aa65d7f560d385',
         ),
     ),
+    (
+        'https://i.imgur.com/lFJai6i.gifv',
+        ('01a6e79a30bec0e644e5da12365d5071',),
+    ),
+    (
+        'https://i.imgur.com/ywSyILa.gifv?',
+        ('56d4afc32d2966017c38d98568709b45',),
+    ),
+    (
+        'https://imgur.com/ubYwpbk.GIFV',
+        ('d4a774aac1667783f9ed3a1bd02fac0c',),
+    ),
 ))
 def test_find_resources(test_url: str, expected_hashes: list[str]):
     mock_download = Mock()
))
def test_find_resources(test_url: str, expected_hashes: list[str]):
mock_download = Mock()

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
# coding=utf-8

from unittest.mock import MagicMock

import pytest

from bdfr.resource import Resource
from bdfr.site_downloaders.pornhub import PornHub


@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
    ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'),
))
def test_find_resources_good(test_url: str, expected_hash: str):
    test_submission = MagicMock()
    test_submission.url = test_url
    downloader = PornHub(test_submission)
    resources = downloader.find_resources()
    assert len(resources) == 1
    assert isinstance(resources[0], Resource)
    resources[0].download(120)
    assert resources[0].hash.hexdigest() == expected_hash

View File

@@ -31,6 +31,7 @@ def test_get_link(test_url: str, expected: str):
     ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'),
     ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'),
     ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'),
+    ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'),
 ))
 def test_download_resource(test_url: str, expected_hash: str):
     mock_submission = Mock()
View File

@@ -14,7 +14,7 @@ from bdfr.site_downloaders.youtube import Youtube
 @pytest.mark.slow
 @pytest.mark.parametrize(('test_url', 'expected_hash'), (
     ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'),
-    ('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'),
+    ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'),
 ))
 def test_find_resources_good(test_url: str, expected_hash: str):
     test_submission = MagicMock()
@@ -28,8 +28,9 @@ def test_find_resources_good(test_url: str, expected_hash: str):
 @pytest.mark.online
-@pytest.mark.parametrize(('test_url'), (
-    ('https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman-interview-oj-simpson-goliath-chronicles'),
+@pytest.mark.parametrize('test_url', (
+    'https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman'
+    '-interview-oj-simpson-goliath-chronicles',
 ))
 def test_find_resources_bad(test_url: str):
     test_submission = MagicMock()

View File

@@ -29,6 +29,8 @@ def downloader_mock(args: Configuration):
     downloader_mock = MagicMock()
     downloader_mock.args = args
     downloader_mock.sanitise_subreddit_name = RedditConnector.sanitise_subreddit_name
+    downloader_mock.create_filtered_listing_generator = lambda x: RedditConnector.create_filtered_listing_generator(
+        downloader_mock, x)
     downloader_mock.split_args_input = RedditConnector.split_args_input
     downloader_mock.master_hash_list = {}
     return downloader_mock
@@ -37,6 +39,7 @@ def downloader_mock(args: Configuration):
 def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]) -> list:
     results = [sub for res in results for sub in res]
     assert all([isinstance(res, praw.models.Submission) for res in results])
+    assert not any([isinstance(m, MagicMock) for m in results])
     if result_limit is not None:
         assert len(results) == result_limit
     return results
@@ -167,18 +170,20 @@ def test_get_subreddit_normal(
     downloader_mock: MagicMock,
     reddit_instance: praw.Reddit,
 ):
-    downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
     downloader_mock.args.limit = limit
     downloader_mock.args.sort = sort_type
+    downloader_mock.time_filter = RedditConnector.create_time_filter(downloader_mock)
+    downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock)
+    downloader_mock.determine_sort_function.return_value = RedditConnector.determine_sort_function(downloader_mock)
     downloader_mock.args.subreddit = test_subreddits
     downloader_mock.reddit_instance = reddit_instance
-    downloader_mock.sort_filter = RedditConnector.create_sort_filter(downloader_mock)
     results = RedditConnector.get_subreddits(downloader_mock)
-    test_subreddits = downloader_mock._split_args_input(test_subreddits)
+    test_subreddits = downloader_mock.split_args_input(test_subreddits)
     results = [sub for res1 in results for sub in res1]
     assert all([isinstance(res1, praw.models.Submission) for res1 in results])
     assert all([res.subreddit.display_name in test_subreddits for res in results])
     assert len(results) <= max_expected_len
+    assert not any([isinstance(m, MagicMock) for m in results])
@pytest.mark.online
@@ -212,6 +217,7 @@ def test_get_subreddit_search(
     assert all([isinstance(res, praw.models.Submission) for res in results])
     assert all([res.subreddit.display_name in test_subreddits for res in results])
     assert len(results) <= max_expected_len
+    assert not any([isinstance(m, MagicMock) for m in results])
@pytest.mark.online
@@ -243,6 +249,7 @@ def test_get_multireddits_public(
     results = [sub for res in results for sub in res]
     assert all([isinstance(res, praw.models.Submission) for res in results])
     assert len(results) == limit
+    assert not any([isinstance(m, MagicMock) for m in results])
@pytest.mark.online
@@ -268,6 +275,7 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic
     results = RedditConnector.get_user_data(downloader_mock)
     results = assert_all_results_are_submissions(limit, results)
     assert all([res.author.name == test_user for res in results])
+    assert not any([isinstance(m, MagicMock) for m in results])
@pytest.mark.online

View File

@@ -9,7 +9,6 @@ from unittest.mock import MagicMock, patch
 import praw.models
 import pytest

 import bdfr.site_downloaders.download_factory
-from bdfr.__main__ import setup_logging
 from bdfr.configuration import Configuration
 from bdfr.connector import RedditConnector