Add a combined command for the archiver and downloader: clone (#433)

* Simplify downloader function

* Add basic scraper class

* Add "scrape" command

* Rename "scrape" command to "clone"

* Add integration tests for clone command

* Update README

* Fix failing test
This commit is contained in:
Serene
2021-06-06 20:29:09 +10:00
committed by GitHub
parent a2f010c40d
commit 434aeb8feb
6 changed files with 139 additions and 34 deletions

View File

@@ -8,6 +8,7 @@ import click
from bdfr.archiver import Archiver
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
from bdfr.cloner import RedditCloner
logger = logging.getLogger()
@@ -32,11 +33,32 @@ _common_options = [
'controversial', 'rising', 'relevance')), default=None),
]
# Click options accepted only by commands that download media (the 'download'
# command, and — via reuse — the combined 'clone' command). Each entry is an
# undecorated click.option; it is applied to a command function by _add_options.
# Defaults are None so the Configuration layer can tell "unset" from "set".
_downloader_options = [
    click.option('--file-scheme', default=None, type=str),
    click.option('--folder-scheme', default=None, type=str),
    click.option('--make-hard-links', is_flag=True, default=None),
    click.option('--max-wait-time', type=int, default=None),
    click.option('--no-dupes', is_flag=True, default=None),
    click.option('--search-existing', is_flag=True, default=None),
    # The following may be given multiple times on the command line.
    click.option('--exclude-id', default=None, multiple=True),
    click.option('--exclude-id-file', default=None, multiple=True),
    click.option('--skip', default=None, multiple=True),
    click.option('--skip-domain', default=None, multiple=True),
    click.option('--skip-subreddit', default=None, multiple=True),
]
def _add_common_options(func):
    """Apply every decorator in the module-level _common_options list to *func*.

    Legacy helper predating the generic _add_options factory; kept for
    backward compatibility with existing command definitions.
    """
    decorated = func
    for option_decorator in _common_options:
        decorated = option_decorator(decorated)
    return decorated
# Click options accepted only by commands that archive submission metadata
# (the 'archive' command, and — via reuse — the combined 'clone' command).
# Applied to a command function by _add_options; None defaults let the
# Configuration layer distinguish "unset" from "explicitly set".
_archiver_options = [
    click.option('--all-comments', is_flag=True, default=None),
    click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
]
def _add_options(opts: list):
def wrap(func):
for opt in opts:
func = opt(func)
return func
return wrap
@click.group()
@@ -45,18 +67,8 @@ def cli():
@cli.command('download')
@click.option('--file-scheme', default=None, type=str)
@click.option('--folder-scheme', default=None, type=str)
@click.option('--make-hard-links', is_flag=True, default=None)
@click.option('--max-wait-time', type=int, default=None)
@click.option('--no-dupes', is_flag=True, default=None)
@click.option('--search-existing', is_flag=True, default=None)
@click.option('--exclude-id', default=None, multiple=True)
@click.option('--exclude-id-file', default=None, multiple=True)
@click.option('--skip', default=None, multiple=True)
@click.option('--skip-domain', default=None, multiple=True)
@click.option('--skip-subreddit', default=None, multiple=True)
@_add_common_options
@_add_options(_common_options)
@_add_options(_downloader_options)
@click.pass_context
def cli_download(context: click.Context, **_):
config = Configuration()
@@ -73,9 +85,8 @@ def cli_download(context: click.Context, **_):
@cli.command('archive')
@_add_common_options
@click.option('--all-comments', is_flag=True, default=None)
@click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None)
@_add_options(_common_options)
@_add_options(_archiver_options)
@click.pass_context
def cli_archive(context: click.Context, **_):
config = Configuration()
@@ -85,7 +96,26 @@ def cli_archive(context: click.Context, **_):
reddit_archiver = Archiver(config)
reddit_archiver.download()
except Exception:
logger.exception('Downloader exited unexpectedly')
logger.exception('Archiver exited unexpectedly')
raise
else:
logger.info('Program complete')
@cli.command('clone')
@_add_options(_common_options)
@_add_options(_archiver_options)
@_add_options(_downloader_options)
@click.pass_context
def cli_clone(context: click.Context, **_):
    """Entry point for the 'clone' command.

    Combines the downloader and the archiver: for every matched submission,
    downloads its media AND writes an archive entry. Accepts the union of
    the common, archiver and downloader option groups.
    """
    config = Configuration()
    config.process_click_arguments(context)
    setup_logging(config.verbose)
    try:
        # Fixed stale naming left over from the command's rename from
        # "scrape" to "clone": the worker is a RedditCloner, and the error
        # message now matches (cf. 'Archiver exited unexpectedly' in archive).
        reddit_cloner = RedditCloner(config)
        reddit_cloner.download()
    except Exception:
        logger.exception('Cloner exited unexpectedly')
        raise
    else:
        logger.info('Program complete')

21
bdfr/cloner.py Normal file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
from bdfr.archiver import Archiver
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
logger = logging.getLogger(__name__)
class RedditCloner(RedditDownloader, Archiver):
    """Combined downloader/archiver backing the 'clone' command.

    For each submission produced by the configured sources, downloads its
    media (RedditDownloader) and writes a metadata archive entry (Archiver).
    """

    def __init__(self, args: Configuration):
        # Zero-argument super() is the Python 3 idiom (file is python3-only
        # per its shebang); cooperative init covers both base classes.
        super().__init__(args)

    def download(self):
        """Download media and archive metadata for every matched submission."""
        for generator in self.reddit_lists:
            for submission in generator:
                self._download_submission(submission)
                self.write_entry(submission)

View File

@@ -42,19 +42,20 @@ class RedditDownloader(RedditConnector):
def download(self):
for generator in self.reddit_lists:
for submission in generator:
if submission.id in self.excluded_submission_ids:
logger.debug(f'Object {submission.id} in exclusion list, skipping')
continue
elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
else:
logger.debug(f'Attempting to download submission {submission.id}')
self._download_submission(submission)
self._download_submission(submission)
def _download_submission(self, submission: praw.models.Submission):
if not isinstance(submission, praw.models.Submission):
if submission.id in self.excluded_submission_ids:
logger.debug(f'Object {submission.id} in exclusion list, skipping')
return
elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
return
elif not isinstance(submission, praw.models.Submission):
logger.warning(f'{submission.id} is not a submission')
return
logger.debug(f'Attempting to download submission {submission.id}')
try:
downloader_class = DownloadFactory.pull_lever(submission.url)
downloader = downloader_class(submission)