Add beginning of archiver

This commit is contained in:
Serene-Arc
2021-03-13 20:18:30 +10:00
committed by Ali Parlakci
parent a93813ca45
commit 959b55a939
8 changed files with 198 additions and 5 deletions

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import praw.models
logger = logging.getLogger(__name__)
class ArchiveEntry:
    """Collects the details and comment tree of a single submission as plain dicts.

    Call `compile()` to obtain one dict holding the post details plus a
    nested list of comments under the 'comments' key.
    """

    def __init__(self, submission: praw.models.Submission):
        """Store the submission to archive; nothing is fetched until `compile()`.

        Args:
            submission: The submission whose details and comments are archived.
        """
        self.submission = submission
        self.comments: list[dict] = []
        self.post_details: dict = {}

    def compile(self) -> dict:
        """Gather post details and comments and return them as one dict.

        Returns:
            A new dict with every entry of `post_details` followed by a
            'comments' key holding the list of top-level comment dicts.
            `post_details` itself is left without the 'comments' key.
        """
        self._fill_entry()
        # Build a fresh dict instead of aliasing post_details, so compiling
        # does not silently alter the stored post details.
        return {**self.post_details, 'comments': self.comments}

    def _fill_entry(self):
        """Populate `comments` and `post_details` from the submission."""
        self._get_comments()
        self._get_post_details()

    def _get_post_details(self):
        """Copy the archived submission attributes into `post_details`.

        A submission without an author is recorded with the author 'DELETED'.
        """
        submission = self.submission
        self.post_details = {
            'title': submission.title,
            'name': submission.name,
            'url': submission.url,
            'selftext': submission.selftext,
            'score': submission.score,
            'upvote_ratio': submission.upvote_ratio,
            'permalink': submission.permalink,
            'id': submission.id,
            'author': submission.author.name if submission.author else 'DELETED',
            'link_flair_text': submission.link_flair_text,
            'num_comments': submission.num_comments,
            'over_18': submission.over_18,
        }

    def _get_comments(self):
        """Convert every top-level comment (and its replies) into `comments`.

        The list is rebuilt from scratch on each call, so calling this (or
        `compile()`) more than once does not duplicate comments.
        """
        logger.debug(f'Retrieving full comment tree for submission {self.submission.id}')
        # limit=None makes PRAW resolve every MoreComments placeholder; a limit
        # of 0 would only discard them and drop the comments they stand for.
        self.submission.comments.replace_more(limit=None)
        self.comments = [
            self._convert_comment_to_dict(top_level_comment)
            for top_level_comment in self.submission.comments
        ]

    @staticmethod
    def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict:
        """Recursively convert a comment and all of its replies into a dict.

        Args:
            in_comment: The comment to convert.

        Returns:
            A dict of the comment's attributes; 'replies' holds the converted
            replies in the same format. A comment without an author is
            recorded with the author 'DELETED'.
        """
        out_dict = {
            'author': in_comment.author.name if in_comment.author else 'DELETED',
            'id': in_comment.id,
            'score': in_comment.score,
            'subreddit': in_comment.subreddit.display_name,
            'submission': in_comment.submission.id,
            'stickied': in_comment.stickied,
            'body': in_comment.body,
            'is_submitter': in_comment.is_submitter,
            'created_utc': in_comment.created_utc,
            'parent_id': in_comment.parent_id,
            'replies': [],
        }
        # Same reasoning as in _get_comments: load truncated replies instead
        # of discarding their placeholders.
        in_comment.replies.replace_more(limit=None)
        out_dict['replies'] = [
            ArchiveEntry._convert_comment_to_dict(reply) for reply in in_comment.replies
        ]
        return out_dict

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# coding=utf-8
import json
import logging
import praw.models
from bulkredditdownloader.archive_entry import ArchiveEntry
from bulkredditdownloader.configuration import Configuration
from bulkredditdownloader.downloader import RedditDownloader
from bulkredditdownloader.exceptions import ArchiverError
from bulkredditdownloader.resource import Resource
logger = logging.getLogger(__name__)
class Archiver(RedditDownloader):
    """Downloader variant that writes one archive record per submission to disk.

    The output format is selected by `args.format`; only 'json' is
    implemented so far, 'xml' and 'yaml' raise NotImplementedError.
    """

    def __init__(self, args: Configuration):
        """Initialise the underlying RedditDownloader with the given configuration."""
        super().__init__(args)

    def download(self):
        """Write an archive record for every submission in `reddit_lists`."""
        for generator in self.reddit_lists:
            for submission in generator:
                logger.debug(f'Attempting to archive submission {submission.id}')
                self._write_submission(submission)

    def _write_submission(self, submission: praw.models.Submission):
        """Archive one submission in the format named by `args.format`.

        Raises:
            ArchiverError: If `args.format` is not 'json', 'xml' or 'yaml'.
            NotImplementedError: For the 'xml' and 'yaml' formats.
        """
        archive_entry = ArchiveEntry(submission)
        if self.args.format == 'json':
            self._write_submission_json(archive_entry)
        elif self.args.format == 'xml':
            self._write_submission_xml(archive_entry)
        elif self.args.format == 'yaml':
            self._write_submission_yaml(archive_entry)
        else:
            raise ArchiverError(f'Unknown format {self.args.format} given')
        logger.info(f'Record for submission {submission.id} written to disk')

    def _write_submission_json(self, entry: ArchiveEntry):
        """Compile the entry and dump it as JSON at the formatted file path.

        Args:
            entry: The archive entry to compile and write.
        """
        resource = Resource(entry.submission, '', '.json')
        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
        # Compile before touching the file system so that a failure while
        # gathering the data does not leave an empty or truncated file behind.
        content = entry.compile()
        # The formatted path may point into a subfolder that does not exist yet.
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as file:
            logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}')
            json.dump(content, file)

    def _write_submission_xml(self, entry: ArchiveEntry):
        """Placeholder for XML output; not implemented yet."""
        raise NotImplementedError

    def _write_submission_yaml(self, entry: ArchiveEntry):
        """Placeholder for YAML output; not implemented yet."""
        raise NotImplementedError

View File

@@ -32,6 +32,9 @@ class Configuration(Namespace):
self.user: Optional[str] = None self.user: Optional[str] = None
self.verbose: int = 0 self.verbose: int = 0
# Archiver-specific options
self.format = 'json'
def process_click_arguments(self, context: click.Context): def process_click_arguments(self, context: click.Context):
for arg_key in context.params.keys(): for arg_key in context.params.keys():
if arg_key in vars(self) and context.params[arg_key] is not None: if arg_key in vars(self) and context.params[arg_key] is not None:

View File

@@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError):
pass pass
class ArchiverError(BulkDownloaderException):
    """Error raised by the archiver, e.g. when an unknown output format is given."""
    pass
class SiteDownloaderError(BulkDownloaderException): class SiteDownloaderError(BulkDownloaderException):
pass pass

View File

@@ -43,7 +43,7 @@ class FileNameFormatter:
result = result.replace('/', '') result = result.replace('/', '')
return result return result
def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
index = f'_{str(index)}' if index else '' index = f'_{str(index)}' if index else ''
if not resource.extension: if not resource.extension:
@@ -70,7 +70,7 @@ class FileNameFormatter:
out = [] out = []
for i, res in enumerate(resources, start=1): for i, res in enumerate(resources, start=1):
logger.log(9, f'Formatting filename with index {i}') logger.log(9, f'Formatting filename with index {i}')
out.append((self._format_path(res, destination_directory, i), res)) out.append((self.format_path(res, destination_directory, i), res))
return out return out
@ staticmethod @ staticmethod

View File

@@ -0,0 +1,32 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bulkredditdownloader.archive_entry import ArchiveEntry
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
    ('m3reby', 27),
))
def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
    """Check that at least `min_comments` top-level comments are collected."""
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    entry._get_comments()
    collected = len(entry.comments)
    assert collected >= min_comments
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
    ('m3kua3', {'author': 'DELETED'}),
))
def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
    """Check that the gathered post details contain every expected key/value pair."""
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    entry._get_post_details()
    details = entry.post_details
    # Evaluate every comparison up front, then require all of them to hold.
    matches = [details[key] == expected for key, expected in expected_dict.items()]
    assert all(matches)

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# coding=utf-8
from pathlib import Path
from unittest.mock import MagicMock
import praw
import pytest
from bulkredditdownloader.archive_entry import ArchiveEntry
from bulkredditdownloader.archiver import Archiver
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
    'm3reby',
))
def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
    """Check that writing a JSON record creates the file at the formatted path."""
    output_file = tmp_path / 'test.json'
    # Stand-in for the Archiver instance whose formatter yields the temp path.
    fake_archiver = MagicMock()
    fake_archiver.file_name_formatter.format_path.return_value = output_file
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    Archiver._write_submission_json(fake_archiver, entry)
    assert output_file.exists()
@pytest.mark.skip
def test_write_submission_xml():
    """Placeholder for the XML writer test; skipped and not yet written."""
    raise NotImplementedError
@pytest.mark.skip
def test_write_submission_yaml():
    """Placeholder for the YAML writer test; skipped and not yet written."""
    raise NotImplementedError

View File

@@ -88,7 +88,7 @@ def test_format_full(
reddit_submission: praw.models.Submission): reddit_submission: praw.models.Submission):
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
test_formatter = FileNameFormatter(format_string_file, format_string_directory) test_formatter = FileNameFormatter(format_string_file, format_string_directory)
result = test_formatter._format_path(test_resource, Path('test')) result = test_formatter.format_path(test_resource, Path('test'))
assert str(result) == expected assert str(result) == expected
@@ -109,7 +109,7 @@ def test_format_full_with_index_suffix(
reddit_submission: praw.models.Submission): reddit_submission: praw.models.Submission):
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
test_formatter = FileNameFormatter(format_string_file, format_string_directory) test_formatter = FileNameFormatter(format_string_file, format_string_directory)
result = test_formatter._format_path(test_resource, Path('test'), index) result = test_formatter.format_path(test_resource, Path('test'), index)
assert str(result) == expected assert str(result) == expected
@@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
test_submission.id = 'BBBBBB' test_submission.id = 'BBBBBB'
test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg') test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}') test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
result = test_formatter._format_path(test_resource, tmp_path) result = test_formatter.format_path(test_resource, tmp_path)
result.parent.mkdir(parents=True) result.parent.mkdir(parents=True)
result.touch() result.touch()