Add beginning of archiver

2021-03-13 20:18:30 +10:00
parent a93813ca45
commit 959b55a939
8 changed files with 198 additions and 5 deletions
--- a/bulkredditdownloader/archive_entry.py
+++ b/bulkredditdownloader/archive_entry.py
@@ -0,0 +1,67 @@
 #!/usr/bin/env python3
 # coding=utf-8
 import logging
 import praw.models
 logger = logging.getLogger(__name__)
 class ArchiveEntry:
     """Gather one submission's details and comments into plain dictionaries.

     Authors that evaluate as false (for example ``None``) are recorded as the string ``'DELETED'``.
     """

     def __init__(self, submission: praw.models.Submission):
         self.submission = submission
         # Converted top-level comments; rebuilt by every call to _get_comments().
         self.comments: list[dict] = []
         # Submission attributes; rebuilt by every call to _get_post_details().
         self.post_details: dict = {}

     def compile(self) -> dict:
         """Return the submission's details and comments as a single dictionary.

         The result holds every key of ``post_details`` followed by a ``'comments'`` key.
         A new dictionary is built on each call, so ``post_details`` itself is left untouched
         and calling this method repeatedly does not duplicate comments.
         """
         self._fill_entry()
         return {**self.post_details, 'comments': self.comments}

     def _fill_entry(self):
         """Refresh both the comment list and the post details from the submission."""
         self._get_comments()
         self._get_post_details()

     def _get_post_details(self):
         """Copy the archived submission attributes into ``post_details``."""
         submission = self.submission
         self.post_details = {
             'title': submission.title,
             'name': submission.name,
             'url': submission.url,
             'selftext': submission.selftext,
             'score': submission.score,
             'upvote_ratio': submission.upvote_ratio,
             'permalink': submission.permalink,
             'id': submission.id,
             'author': submission.author.name if submission.author else 'DELETED',
             'link_flair_text': submission.link_flair_text,
             'num_comments': submission.num_comments,
             'over_18': submission.over_18,
         }

     def _get_comments(self):
         """Convert every top-level comment (and, recursively, its replies) into ``comments``."""
         logger.debug(f'Retrieving full comment tree for submission {self.submission.id}')
         # NOTE(review): in PRAW the first positional parameter of replace_more() is `limit`, and a
         # limit of 0 is documented as dropping MoreComments placeholders rather than fetching them.
         # That seems at odds with the "full comment tree" message above - confirm whether
         # limit=None is intended.
         self.submission.comments.replace_more(0)
         # Build a fresh list instead of appending, so repeated calls never duplicate entries.
         self.comments = [
             self._convert_comment_to_dict(top_level_comment)
             for top_level_comment in self.submission.comments
         ]

     @staticmethod
     def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict:
         """Return ``in_comment`` as a dictionary whose ``'replies'`` list holds its converted replies."""
         out_dict = {
             'author': in_comment.author.name if in_comment.author else 'DELETED',
             'id': in_comment.id,
             'score': in_comment.score,
             'subreddit': in_comment.subreddit.display_name,
             'submission': in_comment.submission.id,
             'stickied': in_comment.stickied,
             'body': in_comment.body,
             'is_submitter': in_comment.is_submitter,
             'created_utc': in_comment.created_utc,
             'parent_id': in_comment.parent_id,
             'replies': [],
         }
         # Placeholders are resolved before iterating so that only real comments are converted.
         in_comment.replies.replace_more(0)
         out_dict['replies'] = [ArchiveEntry._convert_comment_to_dict(reply) for reply in in_comment.replies]
         return out_dict
--- a/bulkredditdownloader/archiver.py
+++ b/bulkredditdownloader/archiver.py
@@ -0,0 +1,51 @@
 #!/usr/bin/env python3
 # coding=utf-8
 import json
 import logging
 import praw.models
 from bulkredditdownloader.archive_entry import ArchiveEntry
 from bulkredditdownloader.configuration import Configuration
 from bulkredditdownloader.downloader import RedditDownloader
 from bulkredditdownloader.exceptions import ArchiverError
 from bulkredditdownloader.resource import Resource
 logger = logging.getLogger(__name__)
 class Archiver(RedditDownloader):
     """Downloader variant that writes an archive record for every submission it is given."""

     def __init__(self, args: Configuration):
         super().__init__(args)

     def download(self):
         """Write a record for each submission produced by the generators in ``reddit_lists``."""
         for generator in self.reddit_lists:
             for submission in generator:
                 logger.debug(f'Attempting to archive submission {submission.id}')
                 self._write_submission(submission)

     def _write_submission(self, submission: praw.models.Submission):
         """Archive ``submission`` in the format named by ``self.args.format``.

         Raises:
             ArchiverError: if the configured format is not ``json``, ``xml`` or ``yaml``.
             NotImplementedError: for ``xml`` and ``yaml``, which have no writer yet.
         """
         archive_entry = ArchiveEntry(submission)
         if self.args.format == 'json':
             self._write_submission_json(archive_entry)
         elif self.args.format == 'xml':
             self._write_submission_xml(archive_entry)
         elif self.args.format == 'yaml':
             self._write_submission_yaml(archive_entry)
         else:
             raise ArchiverError(f'Unknown format {self.args.format} given')
         logger.info(f'Record for submission {submission.id} written to disk')

     def _write_submission_json(self, entry: ArchiveEntry):
         """Serialise ``entry`` as JSON at the path produced by the file name formatter."""
         resource = Resource(entry.submission, '', '.json')
         file_path = self.file_name_formatter.format_path(resource, self.download_directory)
         # Compile before touching the disk so a failure while gathering the entry
         # does not leave an empty file behind.
         content = entry.compile()
         # The formatted path may point into a subfolder that does not exist yet.
         file_path.parent.mkdir(parents=True, exist_ok=True)
         with open(file_path, 'w', encoding='utf-8') as file:
             logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}')
             json.dump(content, file)

     def _write_submission_xml(self, entry: ArchiveEntry):
         """Placeholder for the XML writer; always raises ``NotImplementedError``."""
         raise NotImplementedError

     def _write_submission_yaml(self, entry: ArchiveEntry):
         """Placeholder for the YAML writer; always raises ``NotImplementedError``."""
         raise NotImplementedError
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -32,6 +32,9 @@ class Configuration(Namespace):
        self.user: Optional[str] = None
        self.verbose: int = 0
        # Archiver-specific options
        self.format = 'json'
    def process_click_arguments(self, context: click.Context):
        for arg_key in context.params.keys():
            if arg_key in vars(self) and context.params[arg_key] is not None:
--- a/bulkredditdownloader/exceptions.py
+++ b/bulkredditdownloader/exceptions.py
@@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError):
    pass
 class ArchiverError(BulkDownloaderException):
     """Error raised by the archiver, for example when an unknown output format is requested."""
 class SiteDownloaderError(BulkDownloaderException):
     """Error type for site downloaders (NOTE(review): purpose inferred from the name - confirm against its raisers)."""
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -43,7 +43,7 @@ class FileNameFormatter:
        result = result.replace('/', '')
        return result
-    def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
+    def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
        subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
        index = f'_{str(index)}' if index else ''
        if not resource.extension:
@@ -70,7 +70,7 @@ class FileNameFormatter:
        out = []
        for i, res in enumerate(resources, start=1):
            logger.log(9, f'Formatting filename with index {i}')
-            out.append((self._format_path(res, destination_directory, i), res))
+            out.append((self.format_path(res, destination_directory, i), res))
        return out
    @ staticmethod
--- a/bulkredditdownloader/tests/test_archive_entry.py
+++ b/bulkredditdownloader/tests/test_archive_entry.py
@@ -0,0 +1,32 @@
 #!/usr/bin/env python3
 # coding=utf-8
 import praw
 import pytest
 from bulkredditdownloader.archive_entry import ArchiveEntry
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
    ('m3reby', 27),
))
def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
    """The entry collects at least ``min_comments`` top-level comments for the submission."""
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    entry._get_comments()
    collected = len(entry.comments)
    assert collected >= min_comments
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
    ('m3kua3', {'author': 'DELETED'}),
))
def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
    """Every expected key/value pair appears in the entry's post details."""
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    entry._get_post_details()
    details = entry.post_details
    # Checking one key at a time fails on exactly the same inputs as a single all(...) would.
    for key, expected_value in expected_dict.items():
        assert details[key] == expected_value
--- a/bulkredditdownloader/tests/test_archiver.py
+++ b/bulkredditdownloader/tests/test_archiver.py
@@ -0,0 +1,36 @@
 #!/usr/bin/env python3
 # coding=utf-8
 from pathlib import Path
 from unittest.mock import MagicMock
 import praw
 import pytest
 from bulkredditdownloader.archive_entry import ArchiveEntry
 from bulkredditdownloader.archiver import Archiver
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
    'm3reby',
))
def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
    """Writing an entry as JSON creates the file at the path returned by the formatter."""
    output_path = tmp_path / 'test.json'
    # A mock stands in for the archiver so that only the formatter's return value matters.
    fake_archiver = MagicMock()
    fake_archiver.file_name_formatter.format_path.return_value = output_path
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    Archiver._write_submission_json(fake_archiver, entry)
    assert output_path.exists()
@pytest.mark.skip
def test_write_submission_xml():
    """Skipped placeholder for the XML writer test."""
    raise NotImplementedError
@pytest.mark.skip
def test_write_submission_yaml():
    """Skipped placeholder for the YAML writer test."""
    raise NotImplementedError
--- a/bulkredditdownloader/tests/test_file_name_formatter.py
+++ b/bulkredditdownloader/tests/test_file_name_formatter.py
@@ -88,7 +88,7 @@ def test_format_full(
        reddit_submission: praw.models.Submission):
    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
    test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'))
+    result = test_formatter.format_path(test_resource, Path('test'))
    assert str(result) == expected
@@ -109,7 +109,7 @@ def test_format_full_with_index_suffix(
        reddit_submission: praw.models.Submission):
    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
    test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'), index)
+    result = test_formatter.format_path(test_resource, Path('test'), index)
    assert str(result) == expected
@@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
    test_submission.id = 'BBBBBB'
    test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
    test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
-    result = test_formatter._format_path(test_resource, tmp_path)
+    result = test_formatter.format_path(test_resource, tmp_path)
    result.parent.mkdir(parents=True)
    result.touch()