Add beginning of archiver
bulkredditdownloader/archive_entry.py (new file, 67 lines)
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import logging
+
+import praw.models
+
+logger = logging.getLogger(__name__)
+
+
+class ArchiveEntry:
+    def __init__(self, submission: praw.models.Submission):
+        self.submission = submission
+        self.comments: list[dict] = []
+        self.post_details: dict = {}
+
+    def compile(self) -> dict:
+        self._fill_entry()
+        out = self.post_details
+        out['comments'] = self.comments
+        return out
+
+    def _fill_entry(self):
+        self._get_comments()
+        self._get_post_details()
+
+    def _get_post_details(self):
+        self.post_details = {
+            'title': self.submission.title,
+            'name': self.submission.name,
+            'url': self.submission.url,
+            'selftext': self.submission.selftext,
+            'score': self.submission.score,
+            'upvote_ratio': self.submission.upvote_ratio,
+            'permalink': self.submission.permalink,
+            'id': self.submission.id,
+            'author': self.submission.author.name if self.submission.author else 'DELETED',
+            'link_flair_text': self.submission.link_flair_text,
+            'num_comments': self.submission.num_comments,
+            'over_18': self.submission.over_18,
+        }
+
+    def _get_comments(self):
+        logger.debug(f'Retrieving full comment tree for submission {self.submission.id}')
+        self.submission.comments.replace_more(0)
+        for top_level_comment in self.submission.comments:
+            self.comments.append(self._convert_comment_to_dict(top_level_comment))
+
+    @staticmethod
+    def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict:
+        out_dict = {
+            'author': in_comment.author.name if in_comment.author else 'DELETED',
+            'id': in_comment.id,
+            'score': in_comment.score,
+            'subreddit': in_comment.subreddit.display_name,
+            'submission': in_comment.submission.id,
+            'stickied': in_comment.stickied,
+            'body': in_comment.body,
+            'is_submitter': in_comment.is_submitter,
+            'created_utc': in_comment.created_utc,
+            'parent_id': in_comment.parent_id,
+            'replies': [],
+        }
+        in_comment.replies.replace_more(0)
+        for reply in in_comment.replies:
+            out_dict['replies'].append(ArchiveEntry._convert_comment_to_dict(reply))
+        return out_dict
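For orientation, a minimal sketch of how ArchiveEntry could be exercised on its own, assuming valid Reddit API credentials (the client id, secret, and user agent below are placeholders, not part of this commit):

import praw

from bulkredditdownloader.archive_entry import ArchiveEntry

# Read-only praw client; the credential values here are placeholders.
reddit = praw.Reddit(client_id='CLIENT_ID', client_secret='CLIENT_SECRET', user_agent='archiver example')

submission = reddit.submission(id='m3reby')   # submission id also used by the new tests below
entry = ArchiveEntry(submission)
record = entry.compile()                      # post details plus a nested 'comments' tree
print(record['title'], len(record['comments']))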
bulkredditdownloader/archiver.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import json
+import logging
+
+import praw.models
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+from bulkredditdownloader.configuration import Configuration
+from bulkredditdownloader.downloader import RedditDownloader
+from bulkredditdownloader.exceptions import ArchiverError
+from bulkredditdownloader.resource import Resource
+
+logger = logging.getLogger(__name__)
+
+
+class Archiver(RedditDownloader):
+    def __init__(self, args: Configuration):
+        super(Archiver, self).__init__(args)
+
+    def download(self):
+        for generator in self.reddit_lists:
+            for submission in generator:
+                logger.debug(f'Attempting to archive submission {submission.id}')
+                self._write_submission(submission)
+
+    def _write_submission(self, submission: praw.models.Submission):
+        archive_entry = ArchiveEntry(submission)
+        if self.args.format == 'json':
+            self._write_submission_json(archive_entry)
+        elif self.args.format == 'xml':
+            self._write_submission_xml(archive_entry)
+        elif self.args.format == 'yaml':
+            self._write_submission_yaml(archive_entry)
+        else:
+            raise ArchiverError(f'Unknown format {self.args.format} given')
+        logger.info(f'Record for submission {submission.id} written to disk')
+
+    def _write_submission_json(self, entry: ArchiveEntry):
+        resource = Resource(entry.submission, '', '.json')
+        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
+        with open(file_path, 'w') as file:
+            logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}')
+            json.dump(entry.compile(), file)
+
+    def _write_submission_xml(self, entry: ArchiveEntry):
+        raise NotImplementedError
+
+    def _write_submission_yaml(self, entry: ArchiveEntry):
+        raise NotImplementedError
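A minimal sketch of the intended call pattern, assuming the rest of the Configuration (credentials, sources, output directory) is filled in the same way RedditDownloader already expects:

from bulkredditdownloader.archiver import Archiver
from bulkredditdownloader.configuration import Configuration

config = Configuration()     # remaining downloader options (credentials, targets, paths) omitted here
config.format = 'json'       # the archiver-specific option added to Configuration in this commit

archiver = Archiver(config)  # reuses RedditDownloader's setup via super().__init__
archiver.download()          # walks self.reddit_lists and writes one record per submission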
bulkredditdownloader/configuration.py
@@ -32,6 +32,9 @@ class Configuration(Namespace):
         self.user: Optional[str] = None
         self.verbose: int = 0
 
+        # Archiver-specific options
+        self.format = 'json'
+
     def process_click_arguments(self, context: click.Context):
         for arg_key in context.params.keys():
             if arg_key in vars(self) and context.params[arg_key] is not None:
bulkredditdownloader/exceptions.py
@@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError):
     pass
 
 
+class ArchiverError(BulkDownloaderException):
+    pass
+
+
 class SiteDownloaderError(BulkDownloaderException):
     pass
 
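Since ArchiverError derives from BulkDownloaderException, existing handlers that catch the base class also cover the new archiver path; a small illustrative sketch:

from bulkredditdownloader.exceptions import ArchiverError, BulkDownloaderException

try:
    # Mirrors the unknown-format branch in Archiver._write_submission.
    raise ArchiverError('Unknown format toml given')
except BulkDownloaderException as e:
    print(f'Archiver failed: {e}')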
@@ -43,7 +43,7 @@ class FileNameFormatter:
         result = result.replace('/', '')
         return result
 
-    def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
+    def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
         subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
         index = f'_{str(index)}' if index else ''
         if not resource.extension:
@@ -70,7 +70,7 @@ class FileNameFormatter:
         out = []
         for i, res in enumerate(resources, start=1):
             logger.log(9, f'Formatting filename with index {i}')
-            out.append((self._format_path(res, destination_directory, i), res))
+            out.append((self.format_path(res, destination_directory, i), res))
         return out
 
     @ staticmethod
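The rename from _format_path to format_path makes the method part of FileNameFormatter's public surface, which is what lets the new Archiver call it from outside the class. A sketch of that call, where the file_name_formatter import path and the mocked submission fields are assumptions:

from pathlib import Path
from unittest.mock import MagicMock

from bulkredditdownloader.file_name_formatter import FileNameFormatter  # import path assumed
from bulkredditdownloader.resource import Resource

submission = MagicMock()                       # stand-in for a praw Submission
submission.id = 'm3reby'
submission.subreddit.display_name = 'example'

formatter = FileNameFormatter('{POSTID}', '{SUBREDDIT}')      # file and directory format strings
resource = Resource(submission, '', '.json')                  # same pattern the Archiver uses for JSON records
json_path = formatter.format_path(resource, Path('archive'))  # previously the private _format_path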
bulkredditdownloader/tests/test_archive_entry.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import praw
+import pytest
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
+    ('m3reby', 27),
+))
+def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
+    test_submission = reddit_instance.submission(id=test_submission_id)
+    test_archive_entry = ArchiveEntry(test_submission)
+    test_archive_entry._get_comments()
+    assert len(test_archive_entry.comments) >= min_comments
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
+    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
+    ('m3kua3', {'author': 'DELETED'}),
+))
+def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
+    test_submission = reddit_instance.submission(id=test_submission_id)
+    test_archive_entry = ArchiveEntry(test_submission)
+    test_archive_entry._get_post_details()
+    assert all([test_archive_entry.post_details[key] == expected_dict[key] for key in expected_dict.keys()])
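Both tests hit the live Reddit API, hence the online and reddit markers and the reddit_instance fixture (assumed to be provided by the project's test conftest); one hedged way to run just this module:

import pytest

# Select only the tests in the new module that carry both markers;
# requires working Reddit credentials for the shared reddit_instance fixture.
pytest.main(['-m', 'online and reddit', 'bulkredditdownloader/tests/test_archive_entry.py'])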
bulkredditdownloader/tests/test_archiver.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import praw
+import pytest
+
+from bulkredditdownloader.archive_entry import ArchiveEntry
+from bulkredditdownloader.archiver import Archiver
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize('test_submission_id', (
+    'm3reby',
+))
+def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
+    archiver_mock = MagicMock()
+    test_path = Path(tmp_path, 'test.json')
+    test_submission = reddit_instance.submission(id=test_submission_id)
+    archiver_mock.file_name_formatter.format_path.return_value = test_path
+    test_entry = ArchiveEntry(test_submission)
+    Archiver._write_submission_json(archiver_mock, test_entry)
+    assert test_path.exists()
+
+
+@pytest.mark.skip
+def test_write_submission_xml():
+    raise NotImplementedError
+
+
+@pytest.mark.skip
+def test_write_submission_yaml():
+    raise NotImplementedError
@@ -88,7 +88,7 @@ def test_format_full(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'))
+    result = test_formatter.format_path(test_resource, Path('test'))
     assert str(result) == expected
 
 
@@ -109,7 +109,7 @@ def test_format_full_with_index_suffix(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter._format_path(test_resource, Path('test'), index)
+    result = test_formatter.format_path(test_resource, Path('test'), index)
     assert str(result) == expected
 
 
@@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
     test_submission.id = 'BBBBBB'
     test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
     test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
-    result = test_formatter._format_path(test_resource, tmp_path)
+    result = test_formatter.format_path(test_resource, tmp_path)
     result.parent.mkdir(parents=True)
     result.touch()