Merge pull request #701 from OMEGARAZER/development

This commit is contained in:
Serene
2022-12-01 12:04:28 +10:00
committed by GitHub
28 changed files with 96 additions and 86 deletions

View File

@@ -10,20 +10,24 @@ assignees: ''
- [ ] I am reporting a bug.
- [ ] I am running the latest version of BDfR
- [ ] I have read the [Opening an issue](https://github.com/aliparlakci/bulk-downloader-for-reddit/blob/master/docs/CONTRIBUTING.md#opening-an-issue)
## Description
A clear and concise description of what the bug is.
## Command
```
```text
Paste here the command(s) that causes the bug
```
## Environment (please complete the following information):
- OS: [e.g. Windows 10]
- Python version: [e.g. 3.9.4]
## Environment (please complete the following information)
- OS: [e.g. Windows 10]
- Python version: [e.g. 3.9.4]
## Logs
```
```text
Paste the log output here.
```

View File

@@ -10,6 +10,7 @@ assignees: ''
- [ ] I am requesting a feature.
- [ ] I am running the latest version of BDfR
- [ ] I have read the [Opening an issue](../../README.md#configuration)
## Description
Clearly state the current situation and issues you experience. Then, explain how this feature would solve these issues and make life easier. Also, explain the feature in as much detail as possible.

View File

@@ -10,9 +10,11 @@ assignees: ''
- [ ] I am requesting a site support.
- [ ] I am running the latest version of BDfR
- [ ] I have read the [Opening an issue](../../README.md#configuration)
## Site
Provide a URL to the domain of the site.
## Example posts
Provide example reddit posts with the domain.

View File

@@ -75,7 +75,7 @@ class Configuration(Namespace):
if not yaml_file_loc.exists():
logger.error(f'No YAML file found at {yaml_file_loc}')
return
with open(yaml_file_loc) as file:
with yaml_file_loc.open() as file:
try:
opts = yaml.load(file, Loader=yaml.FullLoader)
except yaml.YAMLError as e:

View File

@@ -91,7 +91,7 @@ class RedditConnector(metaclass=ABCMeta):
logger.log(9, 'Created site authenticator')
self.args.skip_subreddit = self.split_args_input(self.args.skip_subreddit)
self.args.skip_subreddit = set([sub.lower() for sub in self.args.skip_subreddit])
self.args.skip_subreddit = {sub.lower() for sub in self.args.skip_subreddit}
def read_config(self):
"""Read any cfg values that need to be processed"""
@@ -113,7 +113,7 @@ class RedditConnector(metaclass=ABCMeta):
def parse_disabled_modules(self):
disabled_modules = self.args.disable_module
disabled_modules = self.split_args_input(disabled_modules)
disabled_modules = set([name.strip().lower() for name in disabled_modules])
disabled_modules = {name.strip().lower() for name in disabled_modules}
self.args.disable_module = disabled_modules
logger.debug(f'Disabling the following modules: {", ".join(self.args.disable_module)}')
@@ -249,7 +249,7 @@ class RedditConnector(metaclass=ABCMeta):
if self.args.authenticate:
try:
subscribed_subreddits = list(self.reddit_instance.user.subreddits(limit=None))
subscribed_subreddits = set([s.display_name for s in subscribed_subreddits])
subscribed_subreddits = {s.display_name for s in subscribed_subreddits}
except prawcore.InsufficientScope:
logger.error('BDFR has insufficient scope to access subreddit lists')
else:
@@ -428,7 +428,7 @@ class RedditConnector(metaclass=ABCMeta):
if not id_file.exists():
logger.warning(f'ID file at {id_file} does not exist')
continue
with open(id_file, 'r') as file:
with id_file.open('r') as file:
for line in file:
out.append(line.strip())
return set(out)

View File

@@ -4,4 +4,4 @@ client_secret = 7CZHY6AmKweZME5s50SfDGylaPg
scopes = identity, history, read, save, mysubreddits
backup_log_count = 3
max_wait_time = 120
time_format = ISO
time_format = ISO

View File

@@ -36,7 +36,7 @@ class DownloadFilter:
combined_extensions = '|'.join(self.excluded_extensions)
pattern = re.compile(r'.*({})$'.format(combined_extensions))
if re.match(pattern, resource_extension):
logger.log(9, f'Url "{resource_extension}" matched with "{str(pattern)}"')
logger.log(9, f'Url "{resource_extension}" matched with "{pattern}"')
return False
else:
return True
@@ -47,7 +47,7 @@ class DownloadFilter:
combined_domains = '|'.join(self.excluded_domains)
pattern = re.compile(r'https?://.*({}).*'.format(combined_domains))
if re.match(pattern, url):
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
logger.log(9, f'Url "{url}" matched with "{pattern}"')
return False
else:
return True

View File

@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
def _calc_hash(existing_file: Path):
chunk_size = 1024 * 1024
md5_hash = hashlib.md5()
with open(existing_file, 'rb') as file:
with existing_file.open('rb') as file:
chunk = file.read(chunk_size)
while chunk:
md5_hash.update(chunk)
@@ -127,7 +127,7 @@ class RedditDownloader(RedditConnector):
f' in submission {submission.id}')
return
try:
with open(destination, 'wb') as file:
with destination.open('wb') as file:
file.write(res.content)
logger.debug(f'Written file to {destination}')
except OSError as e:

View File

@@ -107,7 +107,7 @@ class FileNameFormatter:
destination_directory,
*[self._format_name(resource.source_submission, part) for part in self.directory_format_string],
)
index = f'_{str(index)}' if index else ''
index = f'_{index}' if index else ''
if not resource.extension:
raise BulkDownloaderException(f'Resource from {resource.url} has no extension')
file_name = str(self._format_name(resource.source_submission, self.file_format_string))

View File

@@ -48,11 +48,11 @@ class Youtube(BaseDownloader):
raise SiteDownloaderError(f'Youtube download failed: {e}')
downloaded_files = list(download_path.iterdir())
if len(downloaded_files) > 0:
if downloaded_files:
downloaded_file = downloaded_files[0]
else:
raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
with open(downloaded_file, 'rb') as file:
with downloaded_file.open('rb') as file:
content = file.read()
return content
return download

View File

@@ -1,5 +1,5 @@
if (-not ([string]::IsNullOrEmpty($env:REDDIT_TOKEN)))
{
copy .\\bdfr\\default_config.cfg .\\test_config.cfg
echo "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg
}
Copy-Item .\\bdfr\\default_config.cfg .\\test_config.cfg
Write-Output "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg
}

View File

@@ -1,4 +1,6 @@
if [ ! -z "$REDDIT_TOKEN" ]
#!/bin/bash
if [ -n "$REDDIT_TOKEN" ]
then
cp ./bdfr/default_config.cfg ./test_config.cfg
echo -e "\nuser_token = $REDDIT_TOKEN" >> ./test_config.cfg

View File

@@ -18,18 +18,18 @@ Another major part of the ethos of the design is DOTADIW, Do One Thing And Do It
The BDFR is organised around a central object, the RedditDownloader class. The Archiver object extends and inherits from this class.
1. The RedditDownloader parses all the arguments and configuration options, held in the Configuration object, and creates a variety of internal objects for use, such as the file name formatter, download filter, etc.
1. The RedditDownloader parses all the arguments and configuration options, held in the Configuration object, and creates a variety of internal objects for use, such as the file name formatter, download filter, etc.
2. The RedditDownloader scrapes raw submissions from Reddit via several methods relating to different sources. A source is defined as a single stream of submissions from a subreddit, multireddit, or user list.
3. These raw submissions are passed to the DownloaderFactory class to select the specialised downloader class to use. Each of these is for a specific website or link type, with some catch-all classes like Direct.
3. These raw submissions are passed to the DownloaderFactory class to select the specialised downloader class to use. Each of these is for a specific website or link type, with some catch-all classes like Direct.
4. The BaseDownloader child, spawned by DownloaderFactory, takes the link and does any necessary processing to find the direct link to the actual resource.
4. The BaseDownloader child, spawned by DownloaderFactory, takes the link and does any necessary processing to find the direct link to the actual resource.
5. This is returned to the RedditDownloader in the form of a Resource object. This holds the URL and some other information for the final resource.
6. The Resource is passed through the DownloadFilter instantiated in step 1.
7. The destination file name for the Resource is calculated. If it already exists, then the Resource will be discarded.
8. Here the actual data is downloaded to the Resource and a hash calculated which is used to find duplicates.

View File

@@ -69,8 +69,6 @@ members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
available at <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
[homepage]: https://www.contributor-covenant.org

View File

@@ -11,19 +11,21 @@ All communication on GitHub, Discord, email, or any other medium must conform to
**Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists, try to contribute to the issue.
### Bugs
When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be requested in the thread of the bug.
When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be requested in the thread of the bug.
### Feature requests
In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has a very small chance to get accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has a very small chance to get accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
## Pull Requests
Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) section below before actually writing any code.
Once you have done both of these, the below list shows the path that should be followed when writing a PR.
1. If an issue does not already exist, open one that will relate to the PR.
2. Ensure that any changes fit into the architecture specified above.
3. Ensure that you have written tests that cover the new code.
@@ -32,24 +34,26 @@ Once you have done both of these, the below list shows the path that should be f
6. Open a pull request that references the relevant issue.
7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.
Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
## Preparing the environment for development
Bulk Downloader for Reddit requires Python 3.9 at minimum. First, ensure that your Python installation satisfies this.
Bulk Downloader for Reddit requires Python 3.9 at minimum. First, ensure that your Python installation satisfies this.
BDfR is built in a way that it can be packaged and installed via `pip`. This places BDfR next to other Python packages and enables you to run the program from any directory. Since it is managed by pip, you can also uninstall it.
To install the program, clone the repository and run pip inside the project's root directory:
```bash
$ git clone https://github.com/aliparlakci/bulk-downloader-for-reddit.git
$ cd ./bulk-downloader-for-reddit
$ python3 -m pip install -e .
git clone https://github.com/aliparlakci/bulk-downloader-for-reddit.git
cd ./bulk-downloader-for-reddit
python3 -m pip install -e .
```
The **`-e`** parameter creates a link to that folder. That is, any change inside the folder affects the package immediately. So, when developing, you can be sure that the package is not stale and Python is always running your latest changes. (Due to this linking, moving/removing/renaming the folder might break it)
The **`-e`** parameter creates a link to that folder. That is, any change inside the folder affects the package immediately. So, when developing, you can be sure that the package is not stale and Python is always running your latest changes. (Due to this linking, moving/removing/renaming the folder might break it)
Then, you can run the program from anywhere on your disk as such:
```bash
bdfr
```
@@ -104,20 +108,20 @@ To exclude one or more marks, the following command can be used, substituting th
pytest -m "not online"
pytest -m "not reddit and not authenticated"
```
### Configuration for authenticated tests
There should be a configuration file `test_config.cfg` in the project's root directory to be able to run the integration tests with reddit authentication. See how to create such files [here](../README.md#configuration). The easiest way of creating this file is copying your existing `default_config.cfg` file from the path stated in the previous link and renaming it to `test_config.cfg`. Be sure that the `user_token` key exists in `test_config.cfg`.
---
For more details, review the pytest documentation that is freely available online.
Many IDEs also provide integrated functionality to run and display the results from tests, and almost all of them support pytest in some capacity. This would be the recommended method due to the additional debugging and general capabilities.
### Writing Tests
When writing tests, ensure that they follow the style guide. The BDFR uses pytest to run tests. Wherever possible, parameterise tests, even if you only have one test case. This makes it easier to expand in the future, as the ultimate goal is to have multiple test cases for every test, instead of just one.
When writing tests, ensure that they follow the style guide. The BDFR uses pytest to run tests. Wherever possible, parameterise tests, even if you only have one test case. This makes it easier to expand in the future, as the ultimate goal is to have multiple test cases for every test, instead of just one.
If required, use of mocks is expected to simplify tests and reduce the resources or complexity required. Tests should be as small as possible and test as small a part of the code as possible. Comprehensive or integration tests are run with the `click` framework and are located in their own file.

View File

@@ -1,7 +1,7 @@
[pytest]
addopts = --strict-markers
markers =
online: tests require a connection to the internet
reddit: tests require a connection to Reddit
slow: test is slow to run
authenticated: test requires an authenticated Reddit instance

View File

@@ -2,10 +2,10 @@
Due to the verboseness of the logs, a great deal of information can be gathered quite easily from the BDFR's logfiles. In this folder, there is a selection of scripts that parse these logs, scraping useful bits of information. Since the logfiles are recurring patterns of strings, it is a fairly simple matter to write scripts that utilise tools included on most Linux systems.
- [Script to extract all successfully downloaded IDs](#extract-all-successfully-downloaded-ids)
- [Script to extract all failed download IDs](#extract-all-failed-ids)
- [Timestamp conversion](#converting-bdfrv1-timestamps-to-bdfrv2-timestamps)
- [Printing summary statistics for a run](#printing-summary-statistics)
- [Script to extract all successfully downloaded IDs](#extract-all-successfully-downloaded-ids)
- [Script to extract all failed download IDs](#extract-all-failed-ids)
- [Timestamp conversion](#converting-bdfrv1-timestamps-to-bdfrv2-timestamps)
- [Printing summary statistics for a run](#printing-summary-statistics)
## Extract all Successfully Downloaded IDs
@@ -58,7 +58,7 @@ A simple script has been included to print summary statistics for a run of the B
This will create an output like the following:
```
```text
Downloaded submissions: 250
Failed downloads: 103
Files already downloaded: 20073

View File

@@ -1,21 +1,21 @@
if (Test-Path -Path $args[0] -PathType Leaf) {
$file=$args[0]
$file=$args[0]
}
else {
Write-Host "CANNOT FIND LOG FILE"
Exit 1
Write-Host "CANNOT FIND LOG FILE"
Exit 1
}
if ($args[1] -ne $null) {
$output=$args[1]
Write-Host "Outputting IDs to $output"
if ($null -ne $args[1]) {
$output=$args[1]
Write-Host "Outputting IDs to $output"
}
else {
$output="./failed.txt"
$output="./failed.txt"
}
Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output
Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } >> $output
Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output
Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output
Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } >> $output
Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } >> $output
Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } >> $output

View File

@@ -1,17 +1,17 @@
if (Test-Path -Path $args[0] -PathType Leaf) {
$file=$args[0]
$file=$args[0]
}
else {
Write-Host "CANNOT FIND LOG FILE"
Exit 1
Write-Host "CANNOT FIND LOG FILE"
Exit 1
}
if ($args[1] -ne $null) {
$output=$args[1]
Write-Host "Outputting IDs to $output"
if ($null -ne $args[1]) {
$output=$args[1]
Write-Host "Outputting IDs to $output"
}
else {
$output="./successful.txt"
$output="./successful.txt"
}
Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output

View File

@@ -1,17 +1,17 @@
if (Test-Path -Path $args[0] -PathType Leaf) {
$file=$args[0]
$file=$args[0]
}
else {
Write-Host "CANNOT FIND LOG FILE"
Exit 1
Write-Host "CANNOT FIND LOG FILE"
Exit 1
}
if ($args[1] -ne $null) {
$output=$args[1]
Write-Host "Outputting IDs to $output"
if ($null -ne $args[1]) {
$output=$args[1]
Write-Host "Outputting IDs to $output"
}
else {
$output="./successful.txt"
$output="./successful.txt"
}
Write-Host -NoNewline "Downloaded submissions: "

View File

@@ -1,2 +1 @@
[2021-06-12 11:18:25,794 - bdfr.downloader - ERROR] - Failed to download resource https://i.redd.it/61fniokpjq471.jpg in submission nxv3dt with downloader Direct: Unrecoverable error requesting resource: HTTP Code 404

View File

@@ -1,2 +1,2 @@
[2022-07-23 14:04:14,095 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 15 < [50]
[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1]
[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1]

View File

@@ -10,7 +10,7 @@ author_email = parlakciali@gmail.com
maintainer = Serene Arc
maintainer_email = serenical@gmail.com
license = GPLv3
classifiers =
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Natural Language :: English

View File

@@ -47,7 +47,7 @@ def assert_all_results_are_submissions(result_limit: int, results: list[Iterator
def assert_all_results_are_submissions_or_comments(result_limit: int, results: list[Iterator]) -> list:
results = [sub for res in results for sub in res]
assert all([isinstance(res, praw.models.Submission) or isinstance(res, praw.models.Comment) for res in results])
assert all([isinstance(res, (praw.models.Submission, praw.models.Comment)) for res in results])
assert not any([isinstance(m, MagicMock) for m in results])
if result_limit is not None:
assert len(results) == result_limit
@@ -259,7 +259,7 @@ def test_get_subreddit_search(
assert all([res.subreddit.display_name in test_subreddits for res in results])
assert len(results) <= max_expected_len
if max_expected_len != 0:
assert len(results) > 0
assert results
assert not any([isinstance(m, MagicMock) for m in results])
@@ -356,7 +356,7 @@ def test_get_subscribed_subreddits(downloader_mock: MagicMock, authenticated_red
downloader_mock.sort_filter = RedditTypes.SortType.HOT
results = RedditConnector.get_subreddits(downloader_mock)
assert all([isinstance(s, praw.models.ListingGenerator) for s in results])
assert len(results) > 0
assert results
@pytest.mark.parametrize(('test_name', 'expected'), (

View File

@@ -152,7 +152,7 @@ def test_download_submission_hash_exists(
RedditDownloader._download_submission(downloader_mock, submission)
folder_contents = list(tmp_path.iterdir())
output = capsys.readouterr()
assert len(folder_contents) == 0
assert not folder_contents
assert re.search(r'Resource hash .*? downloaded elsewhere', output.out)

View File

@@ -66,6 +66,6 @@ def test_token_manager_write(example_config: configparser.ConfigParser, tmp_path
test_manager = OAuth2TokenManager(example_config, test_path)
test_manager.post_refresh_callback(mock_authoriser)
assert example_config.get('DEFAULT', 'user_token') == 'changed_token'
with open(test_path, 'r') as file:
with test_path.open('r') as file:
file_contents = file.read()
assert 'user_token = changed_token' in file_contents