From 959b55a939e04b857f51aeb14571efa887d786cd Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 13 Mar 2021 20:18:30 +1000
Subject: [PATCH] Add beginning of archiver

---
Review notes on this revision of the patch:
- Archiver._write_submission_json now creates the parent directory of the
  target file before opening it.
- ArchiveEntry._get_comments rebuilds self.comments on every call instead of
  appending, so compiling an entry twice no longer duplicates comments.
- Hunk sizes and the diffstat reflect the revised contents; the blob ids on
  the "index" lines are carried over unchanged and were not regenerated.

 bulkredditdownloader/archive_entry.py         | 76 +++++++++++++++++++
 bulkredditdownloader/archiver.py              | 60 +++++++++++++++
 bulkredditdownloader/configuration.py         |  3 +
 bulkredditdownloader/exceptions.py            |  4 ++
 bulkredditdownloader/file_name_formatter.py   |  4 +-
 .../tests/test_archive_entry.py               | 32 ++++++++
 bulkredditdownloader/tests/test_archiver.py   | 36 +++++++++
 .../tests/test_file_name_formatter.py         |  6 +-
 8 files changed, 216 insertions(+), 5 deletions(-)
 create mode 100644 bulkredditdownloader/archive_entry.py
 create mode 100644 bulkredditdownloader/archiver.py
 create mode 100644 bulkredditdownloader/tests/test_archive_entry.py
 create mode 100644 bulkredditdownloader/tests/test_archiver.py

diff --git a/bulkredditdownloader/archive_entry.py b/bulkredditdownloader/archive_entry.py
new file mode 100644
index 0000000..a223c66
--- /dev/null
+++ b/bulkredditdownloader/archive_entry.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import logging
+
+import praw.models
+
+logger = logging.getLogger(__name__)
+
+
+class ArchiveEntry:
+    """Collects the details and comment tree of one submission as plain dicts."""
+
+    def __init__(self, submission: praw.models.Submission):
+        self.submission = submission
+        self.comments: list[dict] = []
+        self.post_details: dict = {}
+
+    def compile(self) -> dict:
+        """Return the post details with the comment tree stored under 'comments'."""
+        self._fill_entry()
+        out = self.post_details
+        out['comments'] = self.comments
+        return out
+
+    def _fill_entry(self):
+        self._get_comments()
+        self._get_post_details()
+
+    def _get_post_details(self):
+        """Copy the submission attributes being archived into self.post_details."""
+        self.post_details = {
+            'title': self.submission.title,
+            'name': self.submission.name,
+            'url': self.submission.url,
+            'selftext': self.submission.selftext,
+            'score': self.submission.score,
+            'upvote_ratio': self.submission.upvote_ratio,
+            'permalink': self.submission.permalink,
+            'id': self.submission.id,
+            'author': self.submission.author.name if self.submission.author else 'DELETED',
+            'link_flair_text': self.submission.link_flair_text,
+            'num_comments': self.submission.num_comments,
+            'over_18': self.submission.over_18,
+        }
+
+    def _get_comments(self):
+        """Fill self.comments with one nested dict per top-level comment."""
+        logger.debug(f'Retrieving full comment tree for submission {self.submission.id}')
+        # NOTE(review): PRAW documents a limit of 0 as dropping MoreComments placeholders without
+        # fetching them; confirm that this matches the 'full comment tree' intent
+        self.submission.comments.replace_more(0)
+        # Rebuild the list instead of appending so that a repeated call, such as a second
+        # compile(), does not store every comment twice
+        self.comments = [self._convert_comment_to_dict(comment) for comment in self.submission.comments]
+
+    @staticmethod
+    def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict:
+        """Convert a comment and, recursively, its replies into a nested dict."""
+        out_dict = {
+            'author': in_comment.author.name if in_comment.author else 'DELETED',
+            'id': in_comment.id,
+            'score': in_comment.score,
+            'subreddit': in_comment.subreddit.display_name,
+            'submission': in_comment.submission.id,
+            'stickied': in_comment.stickied,
+            'body': in_comment.body,
+            'is_submitter': in_comment.is_submitter,
+            'created_utc': in_comment.created_utc,
+            'parent_id': in_comment.parent_id,
+            'replies': [],
+        }
+        in_comment.replies.replace_more(0)
+        for reply in in_comment.replies:
+            out_dict['replies'].append(ArchiveEntry._convert_comment_to_dict(reply))
+        return out_dict
diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py
new file mode 100644
index 0000000..a29aaee
--- /dev/null
+++ b/bulkredditdownloader/archiver.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import json
+import logging
+
+import praw.models
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+from bulkredditdownloader.configuration import Configuration
+from bulkredditdownloader.downloader import RedditDownloader
+from bulkredditdownloader.exceptions import ArchiverError
+from bulkredditdownloader.resource import Resource
+
+logger = logging.getLogger(__name__)
+
+
+class Archiver(RedditDownloader):
+    """Writes a record of each submission to disk in the configured format."""
+
+    def __init__(self, args: Configuration):
+        super(Archiver, self).__init__(args)
+
+    def download(self):
+        for generator in self.reddit_lists:
+            for submission in generator:
+                logger.debug(f'Attempting to archive submission {submission.id}')
+                self._write_submission(submission)
+
+    def _write_submission(self, submission: praw.models.Submission):
+        """Write one submission using the writer selected by self.args.format.
+
+        Raises ArchiverError if the format is not 'json', 'xml' or 'yaml'.
+        """
+        archive_entry = ArchiveEntry(submission)
+        if self.args.format == 'json':
+            self._write_submission_json(archive_entry)
+        elif self.args.format == 'xml':
+            self._write_submission_xml(archive_entry)
+        elif self.args.format == 'yaml':
+            self._write_submission_yaml(archive_entry)
+        else:
+            raise ArchiverError(f'Unknown format {self.args.format} given')
+        logger.info(f'Record for submission {submission.id} written to disk')
+
+    def _write_submission_json(self, entry: ArchiveEntry):
+        """Compile the entry and dump it as JSON to the formatted file path."""
+        resource = Resource(entry.submission, '', '.json')
+        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
+        # format_path() places the file in a formatted subfolder that may not exist yet
+        file_path.parent.mkdir(exist_ok=True, parents=True)
+        with open(file_path, 'w') as file:
+            logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}')
+            json.dump(entry.compile(), file)
+
+    def _write_submission_xml(self, entry: ArchiveEntry):
+        raise NotImplementedError
+
+    def _write_submission_yaml(self, entry: ArchiveEntry):
+        raise NotImplementedError
diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index 6633ec2..09d1b8a 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -32,6 +32,9 @@ class Configuration(Namespace):
         self.user: Optional[str] = None
         self.verbose: int = 0
 
+        # Archiver-specific options
+        self.format = 'json'
+
     def process_click_arguments(self, context: click.Context):
         for arg_key in context.params.keys():
             if arg_key in vars(self) and context.params[arg_key] is not None:
diff --git a/bulkredditdownloader/exceptions.py b/bulkredditdownloader/exceptions.py
index 703ffaa..91fda2c 100644
--- a/bulkredditdownloader/exceptions.py
+++ b/bulkredditdownloader/exceptions.py
@@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError):
     pass
 
 
+class ArchiverError(BulkDownloaderException):
+    pass
+
+
 class SiteDownloaderError(BulkDownloaderException):
     pass
 
diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py
index ffae54b..1950306 100644
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -43,7 +43,7 @@ class FileNameFormatter:
         result = result.replace('/', '')
         return result
 
-    def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
+    def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
         subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
         index = f'_{str(index)}' if index else ''
         if not resource.extension:
@@ -70,7 +70,7 @@ class FileNameFormatter:
         out = []
         for i, res in enumerate(resources, start=1):
             logger.log(9, f'Formatting filename with index {i}')
-            out.append((self._format_path(res, destination_directory, i), res))
+            out.append((self.format_path(res, destination_directory, i), res))
         return out
 
     @ staticmethod
diff --git a/bulkredditdownloader/tests/test_archive_entry.py b/bulkredditdownloader/tests/test_archive_entry.py
new file mode 100644
index 0000000..dba5732
--- /dev/null
+++ b/bulkredditdownloader/tests/test_archive_entry.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import praw
+import pytest
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
+    ('m3reby', 27),
+))
+def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
+    test_submission = reddit_instance.submission(id=test_submission_id)
+    test_archive_entry = ArchiveEntry(test_submission)
+    test_archive_entry._get_comments()
+    assert len(test_archive_entry.comments) >= min_comments
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
+    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
+    ('m3kua3', {'author': 'DELETED'}),
+))
+def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
+    test_submission = reddit_instance.submission(id=test_submission_id)
+    test_archive_entry = ArchiveEntry(test_submission)
+    test_archive_entry._get_post_details()
+    assert all([test_archive_entry.post_details[key] == expected_dict[key] for key in expected_dict.keys()])
diff --git a/bulkredditdownloader/tests/test_archiver.py b/bulkredditdownloader/tests/test_archiver.py
new file mode 100644
index 0000000..7c497ff
--- /dev/null
+++ b/bulkredditdownloader/tests/test_archiver.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import praw
+import pytest
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+from bulkredditdownloader.archiver import Archiver
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize('test_submission_id', (
+    'm3reby',
+))
+def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
+    archiver_mock = MagicMock()
+    test_path = Path(tmp_path, 'test_subfolder', 'test.json')
+    test_submission = reddit_instance.submission(id=test_submission_id)
+    archiver_mock.file_name_formatter.format_path.return_value = test_path
+    test_entry = ArchiveEntry(test_submission)
+    Archiver._write_submission_json(archiver_mock, test_entry)
+    assert test_path.exists()
+
+
+@pytest.mark.skip
+def test_write_submission_xml():
+    raise NotImplementedError
+
+
+@pytest.mark.skip
+def test_write_submission_yaml():
+    raise NotImplementedError
diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py
index ba8042d..3b79904 100644
--- a/bulkredditdownloader/tests/test_file_name_formatter.py
+++ b/bulkredditdownloader/tests/test_file_name_formatter.py
@@ -88,7 +88,7 @@ def test_format_full(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'))
+    result = test_formatter.format_path(test_resource, Path('test'))
     assert str(result) == expected
 
 
@@ -109,7 +109,7 @@ def test_format_full_with_index_suffix(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'), index)
+    result = test_formatter.format_path(test_resource, Path('test'), index)
     assert str(result) == expected
 
 
@@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
     test_submission.id = 'BBBBBB'
     test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
     test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
-    result = test_formatter._format_path(test_resource, tmp_path)
+    result = test_formatter.format_path(test_resource, tmp_path)
     result.parent.mkdir(parents=True)
     result.touch()