Add beginning of archiver
This commit is contained in:
67
bulkredditdownloader/archive_entry.py
Normal file
67
bulkredditdownloader/archive_entry.py
Normal file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
import logging
|
||||
|
||||
import praw.models
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ArchiveEntry:
    """Gather a submission's details and comment tree into plain dicts.

    The constructor only stores the submission; its attributes and comments
    are read when :meth:`compile` (or one of the private helpers) is called.
    """

    def __init__(self, submission: praw.models.Submission):
        self.submission = submission
        # Top-level comments as dicts; replies are nested under each 'replies' key.
        self.comments: list[dict] = []
        # Flat mapping of submission attributes, filled by _get_post_details().
        self.post_details: dict = {}

    def compile(self) -> dict:
        """Return the post details with the comment tree under the 'comments' key.

        The returned dict is the same object as ``self.post_details``. Calling
        this method repeatedly yields the same content each time.
        """
        self._fill_entry()
        out = self.post_details
        out['comments'] = self.comments
        return out

    def _fill_entry(self):
        """Populate both the comment list and the post details."""
        self._get_comments()
        self._get_post_details()

    def _get_post_details(self):
        """Copy the archived submission attributes into ``self.post_details``."""
        self.post_details = {
            'title': self.submission.title,
            'name': self.submission.name,
            'url': self.submission.url,
            'selftext': self.submission.selftext,
            'score': self.submission.score,
            'upvote_ratio': self.submission.upvote_ratio,
            'permalink': self.submission.permalink,
            'id': self.submission.id,
            # A falsy author is recorded with the placeholder 'DELETED'.
            'author': self.submission.author.name if self.submission.author else 'DELETED',
            'link_flair_text': self.submission.link_flair_text,
            'num_comments': self.submission.num_comments,
            'over_18': self.submission.over_18,
        }

    def _get_comments(self):
        """Rebuild ``self.comments`` from the submission's top-level comments."""
        logger.debug(f'Retrieving full comment tree for submission {self.submission.id}')
        # Passed by keyword; NOTE(review): per the PRAW docs limit=0 drops
        # unresolved "more comments" placeholders instead of fetching them.
        self.submission.comments.replace_more(limit=0)
        # Assign a fresh list rather than appending, so that calling compile()
        # more than once does not duplicate every comment.
        self.comments = [
            self._convert_comment_to_dict(top_level_comment)
            for top_level_comment in self.submission.comments
        ]

    @staticmethod
    def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict:
        """Convert a comment and, recursively, all of its replies into a dict."""
        out_dict = {
            # A falsy author is recorded with the placeholder 'DELETED'.
            'author': in_comment.author.name if in_comment.author else 'DELETED',
            'id': in_comment.id,
            'score': in_comment.score,
            'subreddit': in_comment.subreddit.display_name,
            'submission': in_comment.submission.id,
            'stickied': in_comment.stickied,
            'body': in_comment.body,
            'is_submitter': in_comment.is_submitter,
            'created_utc': in_comment.created_utc,
            'parent_id': in_comment.parent_id,
            'replies': [],
        }
        in_comment.replies.replace_more(limit=0)
        out_dict['replies'] = [ArchiveEntry._convert_comment_to_dict(reply) for reply in in_comment.replies]
        return out_dict
|
||||
51
bulkredditdownloader/archiver.py
Normal file
51
bulkredditdownloader/archiver.py
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
import praw.models
|
||||
|
||||
from bulkredditdownloader.archive_entry import ArchiveEntry
|
||||
from bulkredditdownloader.configuration import Configuration
|
||||
from bulkredditdownloader.downloader import RedditDownloader
|
||||
from bulkredditdownloader.exceptions import ArchiverError
|
||||
from bulkredditdownloader.resource import Resource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Archiver(RedditDownloader):
    """Write a serialised ArchiveEntry for each submission in ``self.reddit_lists``."""

    def __init__(self, args: Configuration):
        super().__init__(args)

    def download(self):
        """Archive every submission yielded by every generator in ``self.reddit_lists``."""
        for generator in self.reddit_lists:
            for submission in generator:
                logger.debug(f'Attempting to archive submission {submission.id}')
                self._write_submission(submission)

    def _write_submission(self, submission: praw.models.Submission):
        """Write one submission in the format selected by ``self.args.format``.

        Raises:
            ArchiverError: if the configured format is not 'json', 'xml' or 'yaml'.
            NotImplementedError: for 'xml' and 'yaml', which are not written yet.
        """
        archive_entry = ArchiveEntry(submission)
        if self.args.format == 'json':
            self._write_submission_json(archive_entry)
        elif self.args.format == 'xml':
            self._write_submission_xml(archive_entry)
        elif self.args.format == 'yaml':
            self._write_submission_yaml(archive_entry)
        else:
            raise ArchiverError(f'Unknown format {self.args.format} given')
        logger.info(f'Record for submission {submission.id} written to disk')

    def _write_submission_json(self, entry: ArchiveEntry):
        """Dump the compiled entry as JSON at the path chosen by the file name formatter."""
        resource = Resource(entry.submission, '', '.json')
        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
        # Compile before touching the file so that a failure while reading the
        # submission does not leave an empty or truncated file behind.
        content = entry.compile()
        # NOTE(review): format_path does not appear to create the sub-folder
        # (the formatter tests create it by hand), so make sure it exists.
        file_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}')
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(content, file)

    def _write_submission_xml(self, entry: ArchiveEntry):
        """Placeholder: XML output is not implemented yet."""
        raise NotImplementedError

    def _write_submission_yaml(self, entry: ArchiveEntry):
        """Placeholder: YAML output is not implemented yet."""
        raise NotImplementedError
|
||||
@@ -32,6 +32,9 @@ class Configuration(Namespace):
|
||||
self.user: Optional[str] = None
|
||||
self.verbose: int = 0
|
||||
|
||||
# Archiver-specific options
|
||||
self.format = 'json'
|
||||
|
||||
def process_click_arguments(self, context: click.Context):
|
||||
for arg_key in context.params.keys():
|
||||
if arg_key in vars(self) and context.params[arg_key] is not None:
|
||||
|
||||
@@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError):
|
||||
pass
|
||||
|
||||
|
||||
class ArchiverError(BulkDownloaderException):
    """Signals an archiver failure, e.g. an unrecognised output format."""
|
||||
|
||||
|
||||
class SiteDownloaderError(BulkDownloaderException):
|
||||
pass
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ class FileNameFormatter:
|
||||
result = result.replace('/', '')
|
||||
return result
|
||||
|
||||
def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
|
||||
def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
|
||||
subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
|
||||
index = f'_{str(index)}' if index else ''
|
||||
if not resource.extension:
|
||||
@@ -70,7 +70,7 @@ class FileNameFormatter:
|
||||
out = []
|
||||
for i, res in enumerate(resources, start=1):
|
||||
logger.log(9, f'Formatting filename with index {i}')
|
||||
out.append((self._format_path(res, destination_directory, i), res))
|
||||
out.append((self.format_path(res, destination_directory, i), res))
|
||||
return out
|
||||
|
||||
@ staticmethod
|
||||
|
||||
32
bulkredditdownloader/tests/test_archive_entry.py
Normal file
32
bulkredditdownloader/tests/test_archive_entry.py
Normal file
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
import praw
|
||||
import pytest
|
||||
|
||||
from bulkredditdownloader.archive_entry import ArchiveEntry
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
    ('m3reby', 27),
))
def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
    """The entry collects at least ``min_comments`` top-level comments."""
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    entry._get_comments()
    collected = len(entry.comments)
    assert collected >= min_comments
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
    ('m3kua3', {'author': 'DELETED'}),
))
def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
    """Every expected key is present in the post details with the expected value."""
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    entry._get_post_details()
    matches = [entry.post_details[key] == expected for key, expected in expected_dict.items()]
    assert all(matches)
|
||||
36
bulkredditdownloader/tests/test_archiver.py
Normal file
36
bulkredditdownloader/tests/test_archiver.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import praw
|
||||
import pytest
|
||||
|
||||
from bulkredditdownloader.archive_entry import ArchiveEntry
|
||||
from bulkredditdownloader.archiver import Archiver
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
    'm3reby',
))
def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
    """Writing an entry as JSON creates the file at the formatter-provided path."""
    target = tmp_path / 'test.json'
    # A mock stands in for the Archiver instance; only the formatter's
    # return value matters to the method under test.
    stand_in = MagicMock()
    stand_in.file_name_formatter.format_path.return_value = target
    entry = ArchiveEntry(reddit_instance.submission(id=test_submission_id))
    Archiver._write_submission_json(stand_in, entry)
    assert target.exists()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason='XML output is not implemented yet')
def test_write_submission_xml():
    """Placeholder for the XML writer test; skipped until the writer exists."""
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.mark.skip(reason='YAML output is not implemented yet')
def test_write_submission_yaml():
    """Placeholder for the YAML writer test; skipped until the writer exists."""
    raise NotImplementedError
|
||||
@@ -88,7 +88,7 @@ def test_format_full(
|
||||
reddit_submission: praw.models.Submission):
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
|
||||
test_formatter = FileNameFormatter(format_string_file, format_string_directory)
|
||||
result = test_formatter._format_path(test_resource, Path('test'))
|
||||
result = test_formatter.format_path(test_resource, Path('test'))
|
||||
assert str(result) == expected
|
||||
|
||||
|
||||
@@ -109,7 +109,7 @@ def test_format_full_with_index_suffix(
|
||||
reddit_submission: praw.models.Submission):
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
|
||||
test_formatter = FileNameFormatter(format_string_file, format_string_directory)
|
||||
result = test_formatter._format_path(test_resource, Path('test'), index)
|
||||
result = test_formatter.format_path(test_resource, Path('test'), index)
|
||||
assert str(result) == expected
|
||||
|
||||
|
||||
@@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
|
||||
test_submission.id = 'BBBBBB'
|
||||
test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
|
||||
test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
|
||||
result = test_formatter._format_path(test_resource, tmp_path)
|
||||
result = test_formatter.format_path(test_resource, tmp_path)
|
||||
result.parent.mkdir(parents=True)
|
||||
result.touch()
|
||||
|
||||
Reference in New Issue
Block a user