From 959b55a939e04b857f51aeb14571efa887d786cd Mon Sep 17 00:00:00 2001
From: Serene-Arc <serenical@gmail.com>
Date: Sat, 13 Mar 2021 20:18:30 +1000
Subject: [PATCH] Add beginning of archiver

---
 bulkredditdownloader/archive_entry.py         | 67 +++++++++++++++++++
 bulkredditdownloader/archiver.py              | 51 ++++++++++++++
 bulkredditdownloader/configuration.py         |  3 +
 bulkredditdownloader/exceptions.py            |  4 ++
 bulkredditdownloader/file_name_formatter.py   |  4 +-
 .../tests/test_archive_entry.py               | 32 +++++++++
 bulkredditdownloader/tests/test_archiver.py   | 36 ++++++++++
 .../tests/test_file_name_formatter.py         |  6 +-
 8 files changed, 198 insertions(+), 5 deletions(-)
 create mode 100644 bulkredditdownloader/archive_entry.py
 create mode 100644 bulkredditdownloader/archiver.py
 create mode 100644 bulkredditdownloader/tests/test_archive_entry.py
 create mode 100644 bulkredditdownloader/tests/test_archiver.py

diff --git a/bulkredditdownloader/archive_entry.py b/bulkredditdownloader/archive_entry.py
new file mode 100644
index 0000000..a223c66
--- /dev/null
+++ b/bulkredditdownloader/archive_entry.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import logging
+
+import praw.models
+
+logger = logging.getLogger(__name__)
+
+
+class ArchiveEntry:
+    """Serialisable record of a Reddit submission and its comment tree."""
+
+    def __init__(self, submission: praw.models.Submission):
+        self.submission = submission
+        # Filled by _get_comments(): one nested dict per top-level comment.
+        self.comments: list[dict] = []
+        # Filled by _get_post_details(): flat dict of submission attributes.
+        self.post_details: dict = {}
+
+    def compile(self) -> dict:
+        """Gather the post details and comments and return them as a single dict.
+
+        A new dict is returned so that the 'comments' key is not written into
+        self.post_details, and the comment list is rebuilt on every call, so
+        calling this method more than once does not duplicate comments.
+        """
+        self._fill_entry()
+        return {**self.post_details, 'comments': self.comments}
+
+    def _fill_entry(self):
+        """Refresh the comment list and the post details from the submission."""
+        self._get_comments()
+        self._get_post_details()
+
+    def _get_post_details(self):
+        """Copy the archived submission attributes into self.post_details."""
+        self.post_details = {
+            'title': self.submission.title,
+            'name': self.submission.name,
+            'url': self.submission.url,
+            'selftext': self.submission.selftext,
+            'score': self.submission.score,
+            'upvote_ratio': self.submission.upvote_ratio,
+            'permalink': self.submission.permalink,
+            'id': self.submission.id,
+            # A submission without an author is recorded under a placeholder name.
+            'author': self.submission.author.name if self.submission.author else 'DELETED',
+            'link_flair_text': self.submission.link_flair_text,
+            'num_comments': self.submission.num_comments,
+            'over_18': self.submission.over_18,
+        }
+
+    def _get_comments(self):
+        """Rebuild self.comments from the submission's top-level comments."""
+        logger.debug(f'Retrieving full comment tree for submission {self.submission.id}')
+        # NOTE(review): per the PRAW documentation a limit of 0 drops MoreComments
+        # placeholders without fetching them, so the tree may be partial - confirm intent.
+        self.submission.comments.replace_more(0)
+        # Assign a fresh list instead of appending so repeated calls cannot duplicate entries.
+        self.comments = [self._convert_comment_to_dict(comment) for comment in self.submission.comments]
+
+    @staticmethod
+    def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict:
+        """Convert a comment and, recursively, all of its replies into a plain dict."""
+        out_dict = {
+            'author': in_comment.author.name if in_comment.author else 'DELETED',
+            'id': in_comment.id,
+            'score': in_comment.score,
+            'subreddit': in_comment.subreddit.display_name,
+            'submission': in_comment.submission.id,
+            'stickied': in_comment.stickied,
+            'body': in_comment.body,
+            'is_submitter': in_comment.is_submitter,
+            'created_utc': in_comment.created_utc,
+            'parent_id': in_comment.parent_id,
+        }
+        in_comment.replies.replace_more(0)
+        # 'replies' is added last, after the MoreComments placeholders have been handled.
+        out_dict['replies'] = [ArchiveEntry._convert_comment_to_dict(reply) for reply in in_comment.replies]
+        return out_dict
diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py
new file mode 100644
index 0000000..a29aaee
--- /dev/null
+++ b/bulkredditdownloader/archiver.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import json
+import logging
+
+import praw.models
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+from bulkredditdownloader.configuration import Configuration
+from bulkredditdownloader.downloader import RedditDownloader
+from bulkredditdownloader.exceptions import ArchiverError
+from bulkredditdownloader.resource import Resource
+
+logger = logging.getLogger(__name__)
+
+
+class Archiver(RedditDownloader):
+    """Writes a record of each retrieved submission to disk in the configured format."""
+
+    def __init__(self, args: Configuration):
+        super().__init__(args)
+
+    def download(self):
+        """Archive every submission produced by the generators in self.reddit_lists."""
+        for generator in self.reddit_lists:
+            for submission in generator:
+                logger.debug(f'Attempting to archive submission {submission.id}')
+                self._write_submission(submission)
+
+    def _write_submission(self, submission: praw.models.Submission):
+        """Write one submission using the writer selected by self.args.format.
+
+        Raises:
+            ArchiverError: if self.args.format is not 'json', 'xml' or 'yaml'.
+            NotImplementedError: for 'xml' and 'yaml', which have no writer yet.
+        """
+        archive_entry = ArchiveEntry(submission)
+        if self.args.format == 'json':
+            self._write_submission_json(archive_entry)
+        elif self.args.format == 'xml':
+            self._write_submission_xml(archive_entry)
+        elif self.args.format == 'yaml':
+            self._write_submission_yaml(archive_entry)
+        else:
+            raise ArchiverError(f'Unknown format {self.args.format} given')
+        logger.info(f'Record for submission {submission.id} written to disk')
+
+    def _write_submission_json(self, entry: ArchiveEntry):
+        """Serialise the entry as JSON at the path chosen by the file name formatter."""
+        resource = Resource(entry.submission, '', '.json')
+        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
+        # Compile before opening the file so a failed retrieval does not leave an empty file behind.
+        content = entry.compile()
+        # open() does not create missing parent directories, so make sure the target one exists.
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(file_path, 'w', encoding='utf-8') as file:
+            logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}')
+            json.dump(content, file)
+
+    def _write_submission_xml(self, entry: ArchiveEntry):
+        """Placeholder for the XML writer; not implemented yet."""
+        raise NotImplementedError
+
+    def _write_submission_yaml(self, entry: ArchiveEntry):
+        """Placeholder for the YAML writer; not implemented yet."""
+        raise NotImplementedError
diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index 6633ec2..09d1b8a 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -32,6 +32,9 @@ class Configuration(Namespace):
         self.user: Optional[str] = None
         self.verbose: int = 0
 
+        # Archiver-specific options
+        self.format: str = 'json'  # output format used when writing archive records
+
     def process_click_arguments(self, context: click.Context):
         for arg_key in context.params.keys():
             if arg_key in vars(self) and context.params[arg_key] is not None:
diff --git a/bulkredditdownloader/exceptions.py b/bulkredditdownloader/exceptions.py
index 703ffaa..91fda2c 100644
--- a/bulkredditdownloader/exceptions.py
+++ b/bulkredditdownloader/exceptions.py
@@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError):
     pass
 
 
+class ArchiverError(BulkDownloaderException):
+    """Raised by the archiver, e.g. when an unknown output format is requested."""
+
+
 class SiteDownloaderError(BulkDownloaderException):
     pass
 
diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py
index ffae54b..1950306 100644
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -43,7 +43,7 @@ class FileNameFormatter:
         result = result.replace('/', '')
         return result
 
-    def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
+    def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
         subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
         index = f'_{str(index)}' if index else ''
         if not resource.extension:
@@ -70,7 +70,7 @@ class FileNameFormatter:
         out = []
         for i, res in enumerate(resources, start=1):
             logger.log(9, f'Formatting filename with index {i}')
-            out.append((self._format_path(res, destination_directory, i), res))
+            out.append((self.format_path(res, destination_directory, i), res))
         return out
 
     @ staticmethod
diff --git a/bulkredditdownloader/tests/test_archive_entry.py b/bulkredditdownloader/tests/test_archive_entry.py
new file mode 100644
index 0000000..dba5732
--- /dev/null
+++ b/bulkredditdownloader/tests/test_archive_entry.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import praw
+import pytest
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
+    ('m3reby', 27),
+))
+def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
+    """The entry collects at least the expected number of top-level comments."""
+    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
+    entry._get_comments()
+    assert len(entry.comments) >= min_comments
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
+    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
+    ('m3kua3', {'author': 'DELETED'}),
+))
+def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
+    """Every expected key is present in the post details with the expected value."""
+    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
+    entry._get_post_details()
+    # Checking key by key lets pytest report exactly which field differs.
+    for key, expected_value in expected_dict.items():
+        assert entry.post_details[key] == expected_value
diff --git a/bulkredditdownloader/tests/test_archiver.py b/bulkredditdownloader/tests/test_archiver.py
new file mode 100644
index 0000000..7c497ff
--- /dev/null
+++ b/bulkredditdownloader/tests/test_archiver.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import praw
+import pytest
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+from bulkredditdownloader.archiver import Archiver
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize('test_submission_id', (
+    'm3reby',
+))
+def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
+    """Writing an entry as JSON creates a file at the path given by the formatter."""
+    output_path = tmp_path / 'test.json'
+    # A mock stands in for the Archiver instance so that no real downloader is constructed.
+    archiver_mock = MagicMock()
+    archiver_mock.file_name_formatter.format_path.return_value = output_path
+    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
+    Archiver._write_submission_json(archiver_mock, entry)
+    assert output_path.exists()
+
+
+@pytest.mark.skip
+def test_write_submission_xml():
+    """Placeholder until the XML writer exists."""
+    raise NotImplementedError
+
+
+@pytest.mark.skip
+def test_write_submission_yaml():
+    """Placeholder until the YAML writer exists."""
+    raise NotImplementedError
diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py
index ba8042d..3b79904 100644
--- a/bulkredditdownloader/tests/test_file_name_formatter.py
+++ b/bulkredditdownloader/tests/test_file_name_formatter.py
@@ -88,7 +88,7 @@ def test_format_full(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'))
+    result = test_formatter.format_path(test_resource, Path('test'))
     assert str(result) == expected
 
 
@@ -109,7 +109,7 @@ def test_format_full_with_index_suffix(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'), index)
+    result = test_formatter.format_path(test_resource, Path('test'), index)
     assert str(result) == expected
 
 
@@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
     test_submission.id = 'BBBBBB'
     test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
     test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
-    result = test_formatter._format_path(test_resource, tmp_path)
+    result = test_formatter.format_path(test_resource, tmp_path)
     result.parent.mkdir(parents=True)
     result.touch()