Merge pull request #280 from aliparlakci/v2

V2
This commit is contained in:
Ali Parlakçı
2021-04-20 19:02:39 +03:00
committed by GitHub
93 changed files with 4505 additions and 3190 deletions

.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file

@@ -0,0 +1,29 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''
---
- [ ] I am reporting a bug.
- [ ] I am running the latest version of BDfR
- [ ] I have read the [Opening an issue](README.md#configuration)
## Description
A clear and concise description of what the bug is.
## Command
```
Paste here the command(s) that causes the bug
```
## Environment (please complete the following information):
- OS: [e.g. Windows 10]
- Python version: [e.g. 3.9.4]
## Logs
```
Paste the log output here.
```

.github/ISSUE_TEMPLATE/feature_request.md vendored Normal file

@@ -0,0 +1,15 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[FEATURE]"
labels: ''
assignees: ''
---
- [ ] I am requesting a feature.
- [ ] I am running the latest version of BDfR
- [ ] I have read the [Opening an issue](README.md#configuration)
## Description
Clearly state the current situation and issues you experience. Then, explain how this feature would solve these issues and make life easier. Also, explain the feature with as many detail as possible.

.github/ISSUE_TEMPLATE/site_support_request.md vendored Normal file

@@ -0,0 +1,18 @@
---
name: Site Support request
about: Request support for a new site
title: "[SITE]"
labels: ''
assignees: ''
---
- [ ] I am requesting site support.
- [ ] I am running the latest version of BDfR
- [ ] I have read the [Opening an issue](README.md#configuration)
## Site
Provide a URL to the domain of the site.
## Example posts
Provide example Reddit posts from the domain.

.github/workflows/test.yml vendored Normal file

@@ -0,0 +1,49 @@
name: Python Test
on:
push:
branches: [ v2 ]
pull_request:
branches: [ v2 ]
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.9]
steps:
- uses: actions/checkout@v2
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip flake8 pytest pytest-cov
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Setup test configuration
run: |
cp bdfr/default_config.cfg ./test_config.cfg
echo -e "\nuser_token = ${{ secrets.REDDIT_TEST_TOKEN }}" >> ./test_config.cfg
- name: Lint w/ flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
- name: Test w/ PyTest
run: |
pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered --cov-report html
- name: Upload coverage report
uses: actions/upload-artifact@v2
with:
name: coverage_report
path: htmlcov/

.gitignore vendored

@@ -1,9 +1,141 @@
.DS_Store
build/
dist/
MANIFEST
__pycache__/
src/__pycache__/
config.json
env/
.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Test configuration file
test_config.cfg

Dockerfile

@@ -1,27 +0,0 @@
FROM python:3.9
LABEL Description="This image enables running Bulk Downloader for Reddit within a container environment" Version="0.0.1"
ENV PYTHONUNBUFFERED 1
ENV PYTHONDONTWRITEBYTECODE 1
EXPOSE 8080
EXPOSE 7634
# Install dependencies
RUN apt-get update \
&& apt-get install -y build-essential \
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
# Python requirements
COPY requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt \
&& rm -rf /requirements.txt
# Copy over project files
COPY . /bdfr
WORKDIR /bdfr
# Useful so the image doubles as reference to the binary
ENTRYPOINT ["python", "script.py"]
CMD ["python", "script.py", "-d", "downloads"]

README.md

@@ -1,213 +1,259 @@
# [Bulk Downloader for Reddit v2-beta](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/v2) is out!
[Serene-Arc](https://github.com/Serene-Arc) has reimplemented the Bulk Downloader for Reddit so that it is more flexible, robust, and easier to contribute to. If you are having issues with master, v2 is worth checking out. After cloning the repository, switch to the *v2* branch with `git checkout v2`.
# Bulk Downloader for Reddit
[![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=v2)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml)
# 📥 Bulk Downloader for Reddit
This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources)
Downloads reddit posts. Made by [u/aliparlakci](https://reddit.com/u/aliparlakci)
Please give feedback *(errors, feature requests, etc.)* on the [Issues](https://github.com/aliparlakci/bulk-downloader-for-reddit/issues) page. I will try to resolve them ASAP.
If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate.
## [Download the latest release here](https://github.com/aliparlakci/bulk-downloader-for-reddit/releases/latest)
## Installation
*Bulk Downloader for Reddit* needs Python version 3.9 or above. Please update Python before installation to meet the requirement. Then, you can install it as follows:
```bash
python3 -m pip install bdfr
```
## 🚀 How to use
If you run **Windows**, after you extract the zip file, double-click on *bulk-downloader-for-reddit.exe* and the program will guide you through. Also, take a look at the [Setting up the program](#🔨-setting-up-the-program) section. **However**, Bulk Downloader for Reddit has plenty of features that can only be activated via command-line arguments; see [Options](#⚙-Options) for them.
If you want to use the source code or make contributions, refer to [CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development)
Unfortunately, there is no binary for **macOS** or **Linux**. If you are a macOS or Linux user, you must use the program from the source code. See the [Interpret from source code](docs/INTERPRET_FROM_SOURCE.md) page.
However, a binary version for Linux is in the works, so stay tuned.
Alternatively, regardless of your operating system, you can run the program from the **source code**.
## Usage
### `python3 -m pip install -r requirements.txt`
The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
### `python3 script.py`
There are two modes to the BDFR: download, and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, and statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
See the [Interpret from source code](docs/INTERPRET_FROM_SOURCE.md) page for more information.
After installation, run the program from any directory as shown below:
```bash
python3 -m bdfr download
```
```bash
python3 -m bdfr archive
```
## 🔨 Setting up the program
### 📽 ffmpeg Library
The program needs **ffmpeg** to add audio to some video files. However, installing it is **optional**: the program still runs without errors without ffmpeg, but some video files might have no sound.
Install it through a package manager such as **Chocolatey** on Windows, **apt** on Linux, or **Homebrew** on macOS:
However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are:
- **On Windows**: after you **[install Chocolatey](https://chocolatey.org/install)**, type **`choco install ffmpeg`** in either Command Prompt or PowerShell.
- **On Linux**: type **`sudo apt install ffmpeg`** in a terminal.
- **On macOS**: after you **[install Homebrew](https://brew.sh/)**, type **`brew install ffmpeg`** in Terminal.
```bash
python3 -m bdfr download --subreddit Python -L 10
```
```bash
python3 -m bdfr download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'
```
```bash
python3 -m bdfr download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links
```
```bash
python3 -m bdfr archive --subreddit all --format yaml -L 500 --folder-scheme ''
```
Alternatively, [download ffmpeg](https://www.ffmpeg.org/download.html) manually and [add its bin folder to your system's `PATH`.](https://www.architectryan.com/2018/03/17/add-to-the-path-on-windows-10/) However, the package manager option is recommended.
## Options
## 🐋 Docker
There is also a complete, ready-to-go Docker integration. Install **Docker** and **docker-compose**, then run the following command from the repository root:
### `docker-compose run --service-ports bdfr`
And you'll find yourself right in the app. The files will be downloaded to `downloads/`. Since it is Docker, you may want to change the ownership of the files once you're done (they belong to root by default).
The following options are common between both the `archive` and `download` commands of the BDFR.
_Credits to [wAuner](https://github.com/wAuner)_
- `directory`
- This is the directory to which the BDFR will download and place all files
- `--authenticate`
- This flag will make the BDFR attempt to use an authenticated Reddit session
- See [Authentication](#authentication-and-security) for more details
- `--config`
- If the path to a configuration file is supplied with this option, the BDFR will use the specified config
- See [Configuration Files](#configuration) for more details
- `--saved`
- This option will make the BDFR use the supplied user's saved posts list as a download source
- This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
- `--search`
- This will apply the specified search term to specific lists when scraping submissions
- A search term can only be applied to subreddits and multireddits, supplied with the `-s` and `-m` flags respectively
- `--submitted`
- This will use a user's submissions as a source
- A user must be specified with `--user`
- `--upvoted`
- This will use a user's upvoted posts as a source of posts to scrape
- This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
- `-L, --limit`
- This is the limit on the number of submissions retrieved
- Default is max possible
- Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped
- If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.**
- `-S, --sort`
- This is the sort type for each applicable submission source supplied to the BDFR
- This option does not apply to upvoted or saved posts when scraping from these sources
- The following options are available:
- `controversial`
- `hot` (default)
- `new`
- `relevance` (only available when using `--search`)
- `rising`
- `top`
- `-l, --link`
- This is a direct link to a submission to download, either as a URL or an ID
- Can be specified multiple times
- `-m, --multireddit`
- This is the name of a multireddit to add as a source
- Can be specified multiple times
- This can be done by using `-m` multiple times
- CSV lists of multireddits can also be provided, e.g. `-m 'chess, favourites'`
- The specified multireddits must all belong to the user specified with the `--user` option
- `-s, --subreddit`
- This adds a subreddit as a source
- Can be used multiple times
- This can be done by using `-s` multiple times
- CSV lists of subreddits can also be provided, e.g. `-s 'all, python, mindustry'`
- `-t, --time`
- This is the time filter that will be applied to all applicable sources
- This option does not apply to upvoted or saved posts when scraping from these sources
- The following options are available:
- `all` (default)
- `hour`
- `day`
- `week`
- `month`
- `year`
- `-u, --user`
- This specifies the user to scrape in concert with other options
- When using `--authenticate`, `--user me` can be used to refer to the authenticated user
- `-v, --verbose`
- Increases the verbosity of the program
- Can be specified multiple times
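
For instance, the common options above can be combined as follows (a hypothetical invocation; the directory and values are placeholders only):

```bash
python3 -m bdfr download ./downloads -s 'Python, all' -S top -t week -L 10 -v
```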
## Options
### Downloader Options
Some of the below features are available only through command-line.
Open [Command Prompt](https://youtu.be/bgSSJQolR0E?t=18), [PowerShell](https://youtu.be/bgSSJQolR0E?t=18) or [Terminal](https://youtu.be/Pz4yHAB3G8w?t=31) in the folder that contains the bulk-downloader-for-reddit file (click the links to see how)
After you type **`bulk-downloader-for-reddit.exe`**, type the preferred options.
The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
Example: **`bulk-downloader-for-reddit.exe --subreddit pics --sort top --limit 10`**
- `--exclude-id`
- This will skip the download of any submission with the ID provided
- Can be specified multiple times
- `--exclude-id-file`
- This will skip the download of any submission with any of the IDs in the files provided
- Can be specified multiple times
- Format is one ID per line
- `--make-hard-links`
- This flag will create hard links to an existing file when a duplicate is downloaded
- This will make the file appear in multiple directories while only taking the space of a single instance
- `--max-wait-time`
- This option specifies the maximum wait time for downloading a resource
- The default is 120 seconds
- See [Rate Limiting](#rate-limiting) for details
- `--no-dupes`
- This flag will not redownload files if they already exist somewhere in the root folder tree
- This is calculated by MD5 hash
- `--search-existing`
- This will make the BDFR compile the hashes for every file in `directory` and store them to remove duplicates if `--no-dupes` is also supplied
- `--file-scheme`
- Sets the scheme for files
- Default is `{REDDITOR}_{TITLE}_{POSTID}`
- See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
- `--folder-scheme`
- Sets the scheme for folders
- Default is `{SUBREDDIT}`
- See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
- `--skip-domain`
- This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded
- Can be specified multiple times
- `--skip`
- This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded
- Can be specified multiple times
- `--skip-subreddit`
- This skips all submissions from the specified subreddit
- Can be specified multiple times
- Also accepts CSV subreddit names
## **`--subreddit`**
Downloads posts from the given subreddit(s). Takes a number of subreddit names as a parameter.
Example usage: **`--subreddit IAmA pics --sort hot --limit 10`**
### Archiver Options
## **`--multireddit`**
Downloads posts from a given multireddit. Takes a single multireddit name as a parameter. The **`--user`** option is required.
Example usage: **`--multireddit myMulti --user me --sort top --time week`**
The following options are for the `archive` command specifically.
## **`--search`**
Searches for the given query in the given subreddit(s) or multireddit. Takes a search query as a parameter. The **`--subreddit`** or **`--multireddit`** option is required, as is **`--sort`**.
Example usage: **`--search carter --subreddit funny`**
## **`--submitted`**
Downloads the given redditor's submitted posts. Does not take any parameter. The **`--user`** option is required.
- `--all-comments`
- When combined with the `--user` option, this will download all the user's comments
- `-f, --format`
- This specifies the format of the data file saved to disk
- The following formats are available:
- `json` (default)
- `xml`
- `yaml`
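
For instance, a hypothetical archiver invocation combining these options with the common ones (the directory is a placeholder):

```bash
python3 -m bdfr archive ./archives --user me --authenticate --all-comments -f yaml
```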
Example usage: **`--submitted --user spɛz --sort top --time week`**
## **`--upvoted`**
Downloads the given redditor's upvoted posts. Does not take any parameter. The **`--user`** option is required.
## Authentication and Security
Example usage: **`--upvoted --user spɛz`**
## **`--saved`**
Downloads the logged-in redditor's saved posts. Does not take any parameter. Example usage: **`--saved`**
## **`--link`**
Takes a reddit link as a parameter and downloads the posts in the link. Put the link inside double quotes (`" "`).
Example usage: **`--link "https://www.reddit.com/r/funny/comments/25blmh/"`**
The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
## **`--log`**
The program saves the found posts to a POSTS.json file and the failed posts to a FAILED.json file in the LOG_FILES folder. You can use those files to redownload the posts inside them.
Uses a .json file to redownload posts from. Takes a single path to a .json file as a parameter.
To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Read this and **confirm that there are no more permissions than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests permission to read your saved or upvoted submissions and identify as you.
Example usage: **`--log D:\pics\LOG_FILES\FAILED.json`**
If the permissions look safe, confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on.
---
## Changing Permissions
## **`--user`**
Takes a reddit username as a parameter. Example usage: **`--user spɛz`**
## **`--sort`**
Takes a valid sorting type as a parameter. Valid sort types are `hot`, `top`, `new`, `rising`, `controversial` and `relevance` (if you are using the `--search` option).
Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it *is* allowed by the BDFR.
Example usage: **`--sort top`**
## **`--time`**
Takes a valid time as a parameter. Valid times are `hour`, `day`, `week`, `month`, `year` and `all`. Example usage: **`--time all`**
## **`--limit`**
Takes a number to specify how many posts the program should get. The upper bound is 1000 posts for **each** subreddit. For example, if you are downloading posts from pics and IAmA, the upper bound is 2000. Omit the option to use the highest bound possible.
The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need.
Example usage: **`--limit 500`**
For more details on the configuration file and the values therein, see [Configuration Files](#configuration).
---
## Folder and File Name Schemes
## **`--skip`**
Takes a number of file types as a parameter and skips posts of those types. Valid file types are `images`, `videos`, `gifs`, `self`.
Example usage: **`--skip self videos`**
## **`--skip-domain`**
Takes a number of domains as a parameter and skips the posts from those domains.
The naming and folder schemes for the BDFR are both completely customisable. A number of different fields can be given which will be replaced with properties from a submission when downloading it. The scheme format takes the form of `{KEY}`, where `KEY` is a string from the below list.
Example usage: **`--skip v.redd.it youtube.com youtu.be`**
## **`--quit`**
Automatically quits the application after it finishes. Otherwise, it will wait for an input to quit.
- `DATE`
- `FLAIR`
- `POSTID`
- `REDDITOR`
- `SUBREDDIT`
- `TITLE`
- `UPVOTES`
Example usage: **`--quit`**
## **`--directory`**
Takes a directory to which the posts should be downloaded. Overrides the default directory. Use `..\` for the parent directory and `.\` for the current one.
Each of these can be enclosed in curly brackets, `{}`, and included in the name. For example, to title every downloaded post with just the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as the `download_` prefix in `download_{POSTID}`, which will not change from submission to submission. For example, the previous string will result in the following submission file names:
Example usage: **`--directory D:\bdfr\`**
Example usage: **`--directory ..\images\`**
Example usage: **`-d ..\images\`**
Example usage: **`-d .\`**
## **`--set-filename`**
Starts the program to set a filename template to use for downloading posts. **Does not take any parameter.**
When the program starts, you will be prompted to type a filename template. Use `SUBREDDIT`, `REDDITOR`, `POSTID`, `TITLE`, `UPVOTES`, `FLAIR`, `DATE` in curly brackets `{ }` to refer to the corresponding property of a post.
- `download_aaaaaa.png`
- `download_bbbbbb.png`
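
A minimal sketch of how such a `{KEY}` scheme could be expanded, assuming a plain dict of submission properties (the BDFR's actual formatter also handles sanitisation and name clashes):

```python
import re

def expand_scheme(scheme: str, props: dict) -> str:
    # Replace each {KEY} token with the corresponding submission property
    return re.sub(r'{(\w+)}', lambda m: str(props[m.group(1)]), scheme)

print(expand_scheme('download_{POSTID}', {'POSTID': 'aaaaaa'}))  # download_aaaaaa
```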
❗ Do NOT change the filename structure frequently. If you do, the program cannot find duplicates and will download already-downloaded files again. This would not create any duplicates in the directory, but the program would not be as snappy as it should be.
The default filename template is **`{REDDITOR}_{TITLE}_{POSTID}`**
At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme, however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name. In both cases, there will be no separation between all submissions.
Example usage: **`--set-filename`**
## **`--set-folderpath`**
Starts the program to set a folder structure to use for downloading posts. **Does not take any parameter.**
When the program starts, you will be prompted to type a folder structure template. Use `SUBREDDIT`, `REDDITOR`, `POSTID`, `TITLE`, `UPVOTES`, `FLAIR`, `DATE` in curly brackets `{ }` to refer to the corresponding property of a post. Do not put slashes `/` or backslashes `\` at either end. For instance, **`{REDDITOR}/{SUBREDDIT}/{FLAIR}`**
The default folder structure template is **`{SUBREDDIT}`**
It is highly recommended that the file name scheme contain the parameter `{POSTID}` as this is **the only parameter guaranteed to be unique**. No combination of other keys will necessarily be unique and may result in posts being skipped as the BDFR will see files by the same name and skip the download, assuming that they are already downloaded.
Example usage: **`--set-folderpath`**
## **`--set-default-directory`**
Starts the program to set a default directory to use in case no directory is given. **Does not take any parameter.**
When the program starts, you will be prompted to type a default directory. You can use `{time}` in folder names to timestamp them. For instance, **`D:\bdfr\posts_{time}`**
## Configuration
Example usage: **`--set-default-directory`**
## **`--use-local-config`**
Sets the program to use the config.json file in the current directory, creating it if it does not exist. Useful for having different configurations. **Does not take any parameter.**
Example usage: **`--use-local-config`**
## **`--no-dupes`**
Skips posts that appear in multiple subreddits, downloading them only once. Does not take any parameter.
The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be:
- `C:\Users\<User>\AppData\Local\BDFR\bdfr`
Example usage: **`--no-dupes`**
## **`--no-download`**
Quits the program without downloading the posts. Does not take any parameter.
On macOS, this will be:
- `~/Library/Application Support/bdfr`.
Lastly, on a Linux system, this will be:
- `~/.local/share/bdfr`
Example usage: **`--no-download`**
## **`--downloaded-posts`**
Takes a file path as a parameter and skips posts whose IDs match those inside the file. It also saves the IDs of newly downloaded posts to the given file.
The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report.
Example usage: **`--downloaded-posts D:\bdfr\ALL_POSTS.txt`**
## **`--downloaded-delay`**
When specified, it delays every download by the given number of seconds.
### Configuration File
## ❔ FAQ
`config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied.
### I am running the script on a headless machine or on a remote server. How can I authenticate my reddit account?
- Download the script on your everyday computer and run it once.
- Authenticate the program on both reddit and imgur.
- Go to your Home folder (for Windows users it is `C:\Users\[USERNAME]\`, for Linux users it is `/home/[USERNAME]`)
- Copy the *config.json* file inside the Bulk Downloader for Reddit folder and paste it **next to** the file from which you run the program.
- `backup_log_count`
- `max_wait_time`
- `client_id`
- `client_secret`
- `scopes`
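
For reference, the default configuration shipped with the BDFR (`bdfr/default_config.cfg`, shown later in this diff) contains exactly these keys:

```
[DEFAULT]
client_id = U-6gk4ZCh3IeNQ
client_secret = 7CZHY6AmKweZME5s50SfDGylaPg
scopes = identity, history, read, save
backup_log_count = 3
max_wait_time = 120
```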
### How can I change my credentials?
- All of the user data is held in the **config.json** file, which is in a folder named "Bulk Downloader for Reddit" in your **Home** directory. You can edit it there.
None of these should be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration file is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.
Also, if you already have a config.json file, you can paste it **next to** the script to override the one in your Home directory.
Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count`, however, has to do with log rollover. The logs in the configuration directory can be verbose and, for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs past this will overwrite the oldest log file, called "rolling over". If you want more records of past runs, increase this number.
### What do the dots represent when getting posts?
- Each dot means that 100 posts are scanned.
### Rate Limiting
### Getting posts takes too long.
- You can press *Ctrl+C* to interrupt it and start downloading.
The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases.
### How do I open self post files?
- Self posts are stored on Reddit as Markdown-styled text, so the script downloads them as-is in order not to lose their styling.
However, there is a [great Chrome extension](https://chrome.google.com/webstore/detail/markdown-viewer/ckkdlimhmcjmikdlpkmbgfkaikojcbjk) for viewing Markdown files with their styling. Install it and open the files with [Chrome](https://www.google.com/intl/tr/chrome/).
To this end, the BDFR will sleep for a time before retrying the download, giving the remote server time to "rest". This is done in 60 second increments. For example, if a rate-limiting-related error is given, the BDFR will sleep for 60 seconds before retrying. Then, if the same type of error occurs, it will sleep for another 120 seconds, then 180 seconds, and so on.
However, they are basically text files. You can also view them with any text editor, such as Notepad on Windows, gedit on Linux or Text Editor on macOS.
The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will wait for 60 seconds, then 120 seconds, and then move on. **Note that this results in a total time of 180 seconds trying the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum wait time may help. However, note that the cumulative wait time grows rapidly if the resource is not downloaded, i.e. specifying a max value of 300 (5 minutes) can make the BDFR pause for a total of 15 minutes on one submission, not 5, in the worst case.
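
A minimal sketch of the retry behaviour described above, assuming a hypothetical `download` callable (an illustration of the waits, not the BDFR's actual implementation):

```python
import time

def download_with_backoff(download, max_wait_time: int = 120, increment: int = 60):
    wait = increment
    while True:
        try:
            return download()
        except Exception:
            if wait > max_wait_time:
                raise  # give up once the next wait would exceed the maximum
            time.sleep(wait)  # sleep 60s, then 120s, then 180s, ...
            wait += increment
```

With the default of 120, this sleeps for 60 and then 120 seconds before giving up, matching the 180-second total above.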
## List of currently supported sources
- Direct links (links leading to a file)
- Erome
- Gfycat
- Gif Delivery Network
- Imgur
- Reddit Galleries
- Reddit Text Posts
- Reddit Videos
- Redgifs
- YouTube
## Contributing
If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information.
When reporting any issues or interacting with the developers, please follow the [Code of Conduct](docs/CODE_OF_CONDUCT.md).

_config.yml

@@ -1 +0,0 @@
theme: jekyll-theme-cayman

bdfr/__main__.py Normal file

@@ -0,0 +1,118 @@
#!/usr/bin/env python3
import logging
import sys
import click
from bdfr.archiver import Archiver
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
logger = logging.getLogger()
_common_options = [
click.argument('directory', type=str),
click.option('--config', type=str, default=None),
click.option('-v', '--verbose', default=None, count=True),
click.option('-l', '--link', multiple=True, default=None, type=str),
click.option('-s', '--subreddit', multiple=True, default=None, type=str),
click.option('-m', '--multireddit', multiple=True, default=None, type=str),
click.option('-L', '--limit', default=None, type=int),
click.option('--authenticate', is_flag=True, default=None),
click.option('--submitted', is_flag=True, default=None),
click.option('--upvoted', is_flag=True, default=None),
click.option('--saved', is_flag=True, default=None),
click.option('--search', default=None, type=str),
click.option('-u', '--user', type=str, default=None),
click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
'controversial', 'rising', 'relevance')), default=None),
]
def _add_common_options(func):
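    # Apply each shared click option/argument to the decorated command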
for opt in _common_options:
func = opt(func)
return func
@click.group()
def cli():
pass
@cli.command('download')
@click.option('--exclude-id', default=None, multiple=True)
@click.option('--exclude-id-file', default=None, multiple=True)
@click.option('--file-scheme', default=None, type=str)
@click.option('--folder-scheme', default=None, type=str)
@click.option('--make-hard-links', is_flag=True, default=None)
@click.option('--max-wait-time', type=int, default=None)
@click.option('--no-dupes', is_flag=True, default=None)
@click.option('--search-existing', is_flag=True, default=None)
@click.option('--skip', default=None, multiple=True)
@click.option('--skip-domain', default=None, multiple=True)
@click.option('--skip-subreddit', default=None, multiple=True)
@_add_common_options
@click.pass_context
def cli_download(context: click.Context, **_):
config = Configuration()
config.process_click_arguments(context)
setup_logging(config.verbose)
try:
reddit_downloader = RedditDownloader(config)
reddit_downloader.download()
except Exception:
logger.exception('Downloader exited unexpectedly')
raise
else:
logger.info('Program complete')
@cli.command('archive')
@_add_common_options
@click.option('--all-comments', is_flag=True, default=None)
@click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None)
@click.pass_context
def cli_archive(context: click.Context, **_):
config = Configuration()
config.process_click_arguments(context)
setup_logging(config.verbose)
try:
reddit_archiver = Archiver(config)
reddit_archiver.download()
except Exception:
logger.exception('Downloader exited unexpectedly')
raise
else:
logger.info('Program complete')
def setup_logging(verbosity: int):
class StreamExceptionFilter(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
result = not (record.levelno == logging.ERROR and record.exc_info)
return result
logger.setLevel(1)
stream = logging.StreamHandler(sys.stdout)
stream.addFilter(StreamExceptionFilter())
formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
stream.setFormatter(formatter)
logger.addHandler(stream)
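    # Map the -v/--verbose count to the stream handler's level; 9 is a custom level below DEBUG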
if verbosity <= 0:
stream.setLevel(logging.INFO)
elif verbosity == 1:
stream.setLevel(logging.DEBUG)
else:
stream.setLevel(9)
logging.getLogger('praw').setLevel(logging.CRITICAL)
logging.getLogger('prawcore').setLevel(logging.CRITICAL)
logging.getLogger('urllib3').setLevel(logging.CRITICAL)
if __name__ == '__main__':
cli()

bdfr/archive_entry/__init__.py Normal file

@@ -0,0 +1,2 @@
#!/usr/bin/env python3
# coding=utf-8

bdfr/archive_entry/base_archive_entry.py Normal file

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# coding=utf-8
from abc import ABC, abstractmethod
from praw.models import Comment, Submission
class BaseArchiveEntry(ABC):
def __init__(self, source: (Comment, Submission)):
self.source = source
self.post_details: dict = {}
@abstractmethod
def compile(self) -> dict:
raise NotImplementedError
@staticmethod
def _convert_comment_to_dict(in_comment: Comment) -> dict:
out_dict = {
'author': in_comment.author.name if in_comment.author else 'DELETED',
'id': in_comment.id,
'score': in_comment.score,
'subreddit': in_comment.subreddit.display_name,
'submission': in_comment.submission.id,
'stickied': in_comment.stickied,
'body': in_comment.body,
'is_submitter': in_comment.is_submitter,
'created_utc': in_comment.created_utc,
'parent_id': in_comment.parent_id,
'replies': [],
}
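        # limit=0 removes unresolved MoreComments placeholders so only loaded replies are walked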
in_comment.replies.replace_more(0)
for reply in in_comment.replies:
out_dict['replies'].append(BaseArchiveEntry._convert_comment_to_dict(reply))
return out_dict

bdfr/archive_entry/comment_archive_entry.py Normal file

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import praw.models
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
logger = logging.getLogger(__name__)
class CommentArchiveEntry(BaseArchiveEntry):
def __init__(self, comment: praw.models.Comment):
super(CommentArchiveEntry, self).__init__(comment)
def compile(self) -> dict:
self.source.refresh()
self.post_details = self._convert_comment_to_dict(self.source)
self.post_details['submission_title'] = self.source.submission.title
return self.post_details

bdfr/archive_entry/submission_archive_entry.py Normal file

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import praw.models
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
logger = logging.getLogger(__name__)
class SubmissionArchiveEntry(BaseArchiveEntry):
def __init__(self, submission: praw.models.Submission):
super(SubmissionArchiveEntry, self).__init__(submission)
def compile(self) -> dict:
comments = self._get_comments()
self._get_post_details()
out = self.post_details
out['comments'] = comments
return out
def _get_post_details(self):
self.post_details = {
'title': self.source.title,
'name': self.source.name,
'url': self.source.url,
'selftext': self.source.selftext,
'score': self.source.score,
'upvote_ratio': self.source.upvote_ratio,
'permalink': self.source.permalink,
'id': self.source.id,
'author': self.source.author.name if self.source.author else 'DELETED',
'link_flair_text': self.source.link_flair_text,
'num_comments': self.source.num_comments,
'over_18': self.source.over_18,
'created_utc': self.source.created_utc,
}
def _get_comments(self) -> list[dict]:
logger.debug(f'Retrieving full comment tree for submission {self.source.id}')
comments = []
self.source.comments.replace_more(0)
for top_level_comment in self.source.comments:
comments.append(self._convert_comment_to_dict(top_level_comment))
return comments

bdfr/archiver.py Normal file

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# coding=utf-8
import json
import logging
import re
from typing import Iterator
import dict2xml
import praw.models
import yaml
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
from bdfr.exceptions import ArchiverError
from bdfr.resource import Resource
logger = logging.getLogger(__name__)
class Archiver(RedditDownloader):
def __init__(self, args: Configuration):
super(Archiver, self).__init__(args)
def download(self):
for generator in self.reddit_lists:
for submission in generator:
logger.debug(f'Attempting to archive submission {submission.id}')
self._write_entry(submission)
def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
supplied_submissions = []
for sub_id in self.args.link:
if len(sub_id) == 6:
supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
elif re.match(r'^\w{7}$', sub_id):
supplied_submissions.append(self.reddit_instance.comment(id=sub_id))
else:
supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
return [supplied_submissions]
def _get_user_data(self) -> list[Iterator]:
results = super(Archiver, self)._get_user_data()
if self.args.user and self.args.all_comments:
sort = self._determine_sort_function()
logger.debug(f'Retrieving comments of user {self.args.user}')
results.append(sort(self.reddit_instance.redditor(self.args.user).comments, limit=self.args.limit))
return results
@staticmethod
def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Comment)) -> BaseArchiveEntry:
if isinstance(praw_item, praw.models.Submission):
return SubmissionArchiveEntry(praw_item)
elif isinstance(praw_item, praw.models.Comment):
return CommentArchiveEntry(praw_item)
else:
raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')
def _write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
archive_entry = self._pull_lever_entry_factory(praw_item)
if self.args.format == 'json':
self._write_entry_json(archive_entry)
elif self.args.format == 'xml':
self._write_entry_xml(archive_entry)
elif self.args.format == 'yaml':
self._write_entry_yaml(archive_entry)
else:
raise ArchiverError(f'Unknown format {self.args.format} given')
logger.info(f'Record for entry item {praw_item.id} written to disk')
def _write_entry_json(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.json')
content = json.dumps(entry.compile())
self._write_content_to_disk(resource, content)
def _write_entry_xml(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.xml')
content = dict2xml.dict2xml(entry.compile(), wrap='root')
self._write_content_to_disk(resource, content)
def _write_entry_yaml(self, entry: BaseArchiveEntry):
resource = Resource(entry.source, '', '.yaml')
content = yaml.dump(entry.compile())
self._write_content_to_disk(resource, content)
def _write_content_to_disk(self, resource: Resource, content: str):
file_path = self.file_name_formatter.format_path(resource, self.download_directory)
file_path.parent.mkdir(exist_ok=True, parents=True)
with open(file_path, 'w') as file:
logger.debug(
f'Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}'
f' format at {file_path}')
file.write(content)

bdfr/configuration.py Normal file

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# coding=utf-8
from argparse import Namespace
from typing import Optional
import click
class Configuration(Namespace):
def __init__(self):
super(Configuration, self).__init__()
self.authenticate = False
self.config = None
self.directory: str = '.'
self.exclude_id = []
self.exclude_id_file = []
self.limit: Optional[int] = None
self.link: list[str] = []
self.max_wait_time = None
self.multireddit: list[str] = []
self.no_dupes: bool = False
self.saved: bool = False
self.search: Optional[str] = None
self.search_existing: bool = False
self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
self.folder_scheme: str = '{SUBREDDIT}'
self.skip: list[str] = []
self.skip_domain: list[str] = []
self.skip_subreddit: list[str] = []
self.sort: str = 'hot'
self.submitted: bool = False
self.subreddit: list[str] = []
self.time: str = 'all'
self.upvoted: bool = False
self.user: Optional[str] = None
self.verbose: int = 0
self.make_hard_links = False
# Archiver-specific options
self.format = 'json'
self.all_comments = False
def process_click_arguments(self, context: click.Context):
for arg_key in context.params.keys():
if arg_key in vars(self) and context.params[arg_key] is not None:
vars(self)[arg_key] = context.params[arg_key]
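
A hypothetical illustration of the override rule above; `SimpleNamespace` stands in for a real `click.Context`, and only non-`None` parameters replace the defaults:

```python
from types import SimpleNamespace

from bdfr.configuration import Configuration

config = Configuration()
fake_context = SimpleNamespace(params={'limit': 10, 'sort': None})
config.process_click_arguments(fake_context)
print(config.limit)  # 10 -- overridden by the supplied parameter
print(config.sort)   # 'hot' -- default kept, since the parameter was None
```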

bdfr/default_config.cfg Normal file

@@ -0,0 +1,6 @@
[DEFAULT]
client_id = U-6gk4ZCh3IeNQ
client_secret = 7CZHY6AmKweZME5s50SfDGylaPg
scopes = identity, history, read, save
backup_log_count = 3
max_wait_time = 120

bdfr/download_filter.py Normal file

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import re
logger = logging.getLogger(__name__)
class DownloadFilter:
def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None):
self.excluded_extensions = excluded_extensions
self.excluded_domains = excluded_domains
def check_url(self, url: str) -> bool:
"""Return whether a URL is allowed or not"""
if not self._check_extension(url):
return False
elif not self._check_domain(url):
return False
else:
return True
def _check_extension(self, url: str) -> bool:
if not self.excluded_extensions:
return True
combined_extensions = '|'.join(self.excluded_extensions)
pattern = re.compile(r'.*({})$'.format(combined_extensions))
if re.match(pattern, url):
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
return False
else:
return True
def _check_domain(self, url: str) -> bool:
if not self.excluded_domains:
return True
combined_domains = '|'.join(self.excluded_domains)
pattern = re.compile(r'https?://.*({}).*'.format(combined_domains))
if re.match(pattern, url):
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
return False
else:
return True
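
A hypothetical usage of the filter above:

```python
from bdfr.download_filter import DownloadFilter

download_filter = DownloadFilter(excluded_extensions=['mp4'], excluded_domains=['youtube.com'])
print(download_filter.check_url('https://example.com/clip.mp4'))     # False -- extension excluded
print(download_filter.check_url('https://youtube.com/watch?v=xyz'))  # False -- domain excluded
print(download_filter.check_url('https://example.com/image.png'))    # True  -- allowed
```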

bdfr/downloader.py Normal file

@@ -0,0 +1,448 @@
#!/usr/bin/env python3
# coding=utf-8
import configparser
import hashlib
import importlib.resources
import logging
import logging.handlers
import os
import re
import shutil
import socket
from datetime import datetime
from enum import Enum, auto
from multiprocessing import Pool
from pathlib import Path
from typing import Iterator
import appdirs
import praw
import praw.exceptions
import praw.models
import prawcore
import bdfr.exceptions as errors
from bdfr.configuration import Configuration
from bdfr.download_filter import DownloadFilter
from bdfr.file_name_formatter import FileNameFormatter
from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.download_factory import DownloadFactory
logger = logging.getLogger(__name__)
def _calc_hash(existing_file: Path):
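    # Hash the file's bytes; used to build the master hash list for duplicate detection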
with open(existing_file, 'rb') as file:
file_hash = hashlib.md5(file.read()).hexdigest()
return existing_file, file_hash
class RedditTypes:
class SortType(Enum):
CONTROVERSIAL = auto()
HOT = auto()
NEW = auto()
RELEVANCE = auto()
RISING = auto()
TOP = auto()
class TimeType(Enum):
ALL = 'all'
DAY = 'day'
HOUR = 'hour'
MONTH = 'month'
WEEK = 'week'
YEAR = 'year'
class RedditDownloader:
def __init__(self, args: Configuration):
self.args = args
self.config_directories = appdirs.AppDirs('bdfr', 'BDFR')
self.run_time = datetime.now().isoformat()
self._setup_internal_objects()
self.reddit_lists = self._retrieve_reddit_lists()
def _setup_internal_objects(self):
self._determine_directories()
self._load_config()
self._create_file_logger()
self._read_config()
self.download_filter = self._create_download_filter()
logger.log(9, 'Created download filter')
self.time_filter = self._create_time_filter()
logger.log(9, 'Created time filter')
self.sort_filter = self._create_sort_filter()
logger.log(9, 'Created sort filter')
self.file_name_formatter = self._create_file_name_formatter()
logger.log(9, 'Created file name formatter')
self._create_reddit_instance()
self._resolve_user_name()
self.excluded_submission_ids = self._read_excluded_ids()
if self.args.search_existing:
self.master_hash_list = self.scan_existing_files(self.download_directory)
else:
self.master_hash_list = {}
self.authenticator = self._create_authenticator()
logger.log(9, 'Created site authenticator')
self.args.skip_subreddit = self._split_args_input(self.args.skip_subreddit)
self.args.skip_subreddit = set([sub.lower() for sub in self.args.skip_subreddit])
def _read_config(self):
"""Read any cfg values that need to be processed"""
if self.args.max_wait_time is None:
if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'):
self.cfg_parser.set('DEFAULT', 'max_wait_time', '120')
logger.log(9, 'Wrote default download wait time to config file')
self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time')
logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds')
# Update config on disk
with open(self.config_location, 'w') as file:
self.cfg_parser.write(file)
def _create_reddit_instance(self):
if self.args.authenticate:
logger.debug('Using authenticated Reddit instance')
if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
logger.log(9, 'Commencing OAuth2 authentication')
scopes = self.cfg_parser.get('DEFAULT', 'scopes')
scopes = OAuth2Authenticator.split_scopes(scopes)
oauth2_authenticator = OAuth2Authenticator(
scopes,
self.cfg_parser.get('DEFAULT', 'client_id'),
self.cfg_parser.get('DEFAULT', 'client_secret'),
)
token = oauth2_authenticator.retrieve_new_token()
self.cfg_parser['DEFAULT']['user_token'] = token
with open(self.config_location, 'w') as file:
self.cfg_parser.write(file, True)
token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)
self.authenticated = True
self.reddit_instance = praw.Reddit(
client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
user_agent=socket.gethostname(),
token_manager=token_manager,
)
else:
logger.debug('Using unauthenticated Reddit instance')
self.authenticated = False
self.reddit_instance = praw.Reddit(
client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
user_agent=socket.gethostname(),
)
def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
master_list = []
master_list.extend(self._get_subreddits())
logger.log(9, 'Retrieved subreddits')
master_list.extend(self._get_multireddits())
logger.log(9, 'Retrieved multireddits')
master_list.extend(self._get_user_data())
logger.log(9, 'Retrieved user data')
master_list.extend(self._get_submissions_from_link())
logger.log(9, 'Retrieved submissions for given links')
return master_list
def _determine_directories(self):
self.download_directory = Path(self.args.directory).resolve().expanduser()
self.config_directory = Path(self.config_directories.user_config_dir)
self.download_directory.mkdir(exist_ok=True, parents=True)
self.config_directory.mkdir(exist_ok=True, parents=True)
def _load_config(self):
self.cfg_parser = configparser.ConfigParser()
if self.args.config:
if (cfg_path := Path(self.args.config)).exists():
self.cfg_parser.read(cfg_path)
self.config_location = cfg_path
return
possible_paths = [
Path('./config.cfg'),
Path('./default_config.cfg'),
Path(self.config_directory, 'config.cfg'),
Path(self.config_directory, 'default_config.cfg'),
]
self.config_location = None
for path in possible_paths:
if path.resolve().expanduser().exists():
self.config_location = path
logger.debug(f'Loading configuration from {path}')
break
if not self.config_location:
self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0]
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
if not self.config_location:
raise errors.BulkDownloaderException('Could not find a configuration file to load')
self.cfg_parser.read(self.config_location)
def _create_file_logger(self):
main_logger = logging.getLogger()
log_path = Path(self.config_directory, 'log_output.txt')
backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3)
file_handler = logging.handlers.RotatingFileHandler(
log_path,
mode='a',
backupCount=backup_count,
)
if log_path.exists():
file_handler.doRollover()
formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
file_handler.setFormatter(formatter)
file_handler.setLevel(0)
main_logger.addHandler(file_handler)
@staticmethod
def _sanitise_subreddit_name(subreddit: str) -> str:
pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)(?:/)?$')
match = re.match(pattern, subreddit)
if not match:
raise errors.BulkDownloaderException(f'Could not find subreddit name in string {subreddit}')
return match.group(1)
@staticmethod
def _split_args_input(entries: list[str]) -> set[str]:
all_entries = []
split_pattern = re.compile(r'[,;]\s?')
for entry in entries:
results = re.split(split_pattern, entry)
all_entries.extend([RedditDownloader._sanitise_subreddit_name(name) for name in results])
return set(all_entries)
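    # Hypothetical illustration (not part of the original file):
    #   _split_args_input(['chess, favourites', 'r/all;python'])
    #   returns {'chess', 'favourites', 'all', 'python'}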
def _get_subreddits(self) -> list[praw.models.ListingGenerator]:
if self.args.subreddit:
out = []
sort_function = self._determine_sort_function()
for reddit in self._split_args_input(self.args.subreddit):
try:
reddit = self.reddit_instance.subreddit(reddit)
if self.args.search:
out.append(reddit.search(
self.args.search,
sort=self.sort_filter.name.lower(),
limit=self.args.limit,
time_filter=self.time_filter.value,
))
logger.debug(
f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"')
else:
out.append(self._create_filtered_listing_generator(reddit))
logger.debug(f'Added submissions from subreddit {reddit}')
except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e:
logger.error(f'Failed to get submissions for subreddit {reddit}: {e}')
return out
else:
return []
def _resolve_user_name(self):
if self.args.user == 'me':
if self.authenticated:
self.args.user = self.reddit_instance.user.me().name
logger.log(9, f'Resolved user to {self.args.user}')
else:
self.args.user = None
logger.warning('To use "me" as a user, an authenticated Reddit instance must be used')
def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
supplied_submissions = []
for sub_id in self.args.link:
if len(sub_id) == 6:
supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
else:
supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
return [supplied_submissions]
def _determine_sort_function(self):
if self.sort_filter is RedditTypes.SortType.NEW:
sort_function = praw.models.Subreddit.new
elif self.sort_filter is RedditTypes.SortType.RISING:
sort_function = praw.models.Subreddit.rising
elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL:
sort_function = praw.models.Subreddit.controversial
elif self.sort_filter is RedditTypes.SortType.TOP:
sort_function = praw.models.Subreddit.top
else:
sort_function = praw.models.Subreddit.hot
return sort_function
def _get_multireddits(self) -> list[Iterator]:
if self.args.multireddit:
out = []
for multi in self._split_args_input(self.args.multireddit):
try:
multi = self.reddit_instance.multireddit(self.args.user, multi)
if not multi.subreddits:
raise errors.BulkDownloaderException
out.append(self._create_filtered_listing_generator(multi))
logger.debug(f'Added submissions from multireddit {multi}')
except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e:
logger.error(f'Failed to get submissions for multireddit {multi}: {e}')
return out
else:
return []
def _create_filtered_listing_generator(self, reddit_source) -> Iterator:
sort_function = self._determine_sort_function()
if self.sort_filter in (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL):
return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value)
else:
return sort_function(reddit_source, limit=self.args.limit)
def _get_user_data(self) -> list[Iterator]:
if any([self.args.submitted, self.args.upvoted, self.args.saved]):
if self.args.user:
if not self._check_user_existence(self.args.user):
logger.error(f'User {self.args.user} does not exist')
return []
generators = []
if self.args.submitted:
logger.debug(f'Retrieving submitted posts of user {self.args.user}')
generators.append(self._create_filtered_listing_generator(
self.reddit_instance.redditor(self.args.user).submissions,
))
if not self.authenticated and any((self.args.upvoted, self.args.saved)):
logger.warning('Accessing user lists requires authentication')
else:
if self.args.upvoted:
logger.debug(f'Retrieving upvoted posts of user {self.args.user}')
generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit))
if self.args.saved:
logger.debug(f'Retrieving saved posts of user {self.args.user}')
generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit))
return generators
else:
logger.warning('A user must be supplied to download user data')
return []
else:
return []
def _check_user_existence(self, name: str) -> bool:
user = self.reddit_instance.redditor(name=name)
try:
if not user.id:
return False
except prawcore.exceptions.NotFound:
return False
return True
def _create_file_name_formatter(self) -> FileNameFormatter:
return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme)
def _create_time_filter(self) -> RedditTypes.TimeType:
try:
return RedditTypes.TimeType[self.args.time.upper()]
except (KeyError, AttributeError):
return RedditTypes.TimeType.ALL
def _create_sort_filter(self) -> RedditTypes.SortType:
try:
return RedditTypes.SortType[self.args.sort.upper()]
except (KeyError, AttributeError):
return RedditTypes.SortType.HOT
def _create_download_filter(self) -> DownloadFilter:
return DownloadFilter(self.args.skip, self.args.skip_domain)
def _create_authenticator(self) -> SiteAuthenticator:
return SiteAuthenticator(self.cfg_parser)
def download(self):
for generator in self.reddit_lists:
for submission in generator:
if submission.id in self.excluded_submission_ids:
logger.debug(f'Object {submission.id} in exclusion list, skipping')
continue
elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
else:
logger.debug(f'Attempting to download submission {submission.id}')
self._download_submission(submission)
def _download_submission(self, submission: praw.models.Submission):
if not isinstance(submission, praw.models.Submission):
logger.warning(f'{submission.id} is not a submission')
return
if not self.download_filter.check_url(submission.url):
logger.debug(f'Download filter removed submission {submission.id} with URL {submission.url}')
return
try:
downloader_class = DownloadFactory.pull_lever(submission.url)
downloader = downloader_class(submission)
logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')
except errors.NotADownloadableLinkError as e:
logger.error(f'Could not download submission {submission.id}: {e}')
return
try:
content = downloader.find_resources(self.authenticator)
except errors.SiteDownloaderError as e:
logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')
return
for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
if destination.exists():
logger.debug(f'File {destination} already exists, continuing')
else:
try:
res.download(self.args.max_wait_time)
except errors.BulkDownloaderException as e:
logger.error(
f'Failed to download resource {res.url} with downloader {downloader_class.__name__}: {e}')
return
resource_hash = res.hash.hexdigest()
destination.parent.mkdir(parents=True, exist_ok=True)
if resource_hash in self.master_hash_list:
if self.args.no_dupes:
logger.info(
f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
return
elif self.args.make_hard_links:
self.master_hash_list[resource_hash].link_to(destination)
logger.info(
f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
return
with open(destination, 'wb') as file:
file.write(res.content)
logger.debug(f'Written file to {destination}')
self.master_hash_list[resource_hash] = destination
logger.debug(f'Hash added to master list: {resource_hash}')
logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
@staticmethod
def scan_existing_files(directory: Path) -> dict[str, Path]:
files = []
for (dirpath, dirnames, filenames) in os.walk(directory):
files.extend([Path(dirpath, file) for file in filenames])
logger.info(f'Calculating hashes for {len(files)} files')
pool = Pool(15)
results = pool.map(_calc_hash, files)
pool.close()
hash_list = {res[1]: res[0] for res in results}
return hash_list
def _read_excluded_ids(self) -> set[str]:
out = []
out.extend(self.args.exclude_id)
for id_file in self.args.exclude_id_file:
id_file = Path(id_file).expanduser().resolve()
if not id_file.exists():
logger.warning(f'ID exclusion file at {id_file} does not exist')
continue
with open(id_file, 'r') as file:
for line in file:
out.append(line.strip())
return set(out)
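The master_hash_list logic above is what powers --no-dupes and hard-linking. As a minimal illustrative sketch (the helper below is not part of the diff), the idea reduces to mapping each MD5 digest to the first file that produced it:

```python
import hashlib
from pathlib import Path


def first_file_per_hash(paths: list[Path]) -> dict[str, Path]:
    """Map each MD5 digest to the first file that produced it."""
    seen: dict[str, Path] = {}
    for path in paths:
        digest = hashlib.md5(path.read_bytes()).hexdigest()
        # setdefault keeps the first occurrence, so later duplicates can be
        # skipped or hard-linked, as in _download_submission above
        seen.setdefault(digest, path)
    return seen
```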

28
bdfr/exceptions.py Normal file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
class BulkDownloaderException(Exception):
pass
class RedditUserError(BulkDownloaderException):
pass
class RedditAuthenticationError(RedditUserError):
pass
class ArchiverError(BulkDownloaderException):
pass
class SiteDownloaderError(BulkDownloaderException):
pass
class NotADownloadableLinkError(SiteDownloaderError):
pass
class ResourceNotFound(SiteDownloaderError):
pass
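Since every error above derives from BulkDownloaderException, one broad handler is enough to catch the whole family, which is how the downloader uses them. A small illustration:

```python
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError

try:
    raise NotADownloadableLinkError('no downloader module exists for this url')
except SiteDownloaderError as error:
    # NotADownloadableLinkError subclasses SiteDownloaderError, so the
    # broader handler still catches it
    print(f'Could not download submission: {error}')
```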

176
bdfr/file_name_formatter.py Normal file
View File

@@ -0,0 +1,176 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import platform
import re
from pathlib import Path
from typing import Optional, Union
from praw.models import Comment, Submission
from bdfr.exceptions import BulkDownloaderException
from bdfr.resource import Resource
logger = logging.getLogger(__name__)
class FileNameFormatter:
key_terms = (
'date',
'flair',
'postid',
'redditor',
'subreddit',
'title',
'upvotes',
)
def __init__(self, file_format_string: str, directory_format_string: str):
if not self.validate_string(file_format_string):
raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
self.file_format_string = file_format_string
self.directory_format_string: list[str] = directory_format_string.split('/')
@staticmethod
def _format_name(submission: Union[Comment, Submission], format_string: str) -> str:
if isinstance(submission, Submission):
attributes = FileNameFormatter._generate_name_dict_from_submission(submission)
elif isinstance(submission, Comment):
attributes = FileNameFormatter._generate_name_dict_from_comment(submission)
else:
raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')
result = format_string
for key in attributes.keys():
if re.search(fr'(?i).*{{{key}}}.*', result):
key_value = str(attributes.get(key, 'unknown'))
key_value = FileNameFormatter._convert_unicode_escapes(key_value)
key_value = key_value.replace('\\', '\\\\')
result = re.sub(fr'(?i){{{key}}}', key_value, result)
result = result.replace('/', '')
if platform.system() == 'Windows':
result = FileNameFormatter._format_for_windows(result)
return result
@staticmethod
def _convert_unicode_escapes(in_string: str) -> str:
# unicode escape sequences use four hex digits, e.g. \u00e9
pattern = re.compile(r'(\\u[0-9a-fA-F]{4})')
for match in re.findall(pattern, in_string):
converted_match = bytes(match, 'utf-8').decode('unicode-escape')
in_string = in_string.replace(match, converted_match)
return in_string
@staticmethod
def _generate_name_dict_from_submission(submission: Submission) -> dict:
submission_attributes = {
'title': submission.title,
'subreddit': submission.subreddit.display_name,
'redditor': submission.author.name if submission.author else 'DELETED',
'postid': submission.id,
'upvotes': submission.score,
'flair': submission.link_flair_text,
'date': submission.created_utc
}
return submission_attributes
@staticmethod
def _generate_name_dict_from_comment(comment: Comment) -> dict:
comment_attributes = {
'title': comment.submission.title,
'subreddit': comment.subreddit.display_name,
'redditor': comment.author.name if comment.author else 'DELETED',
'postid': comment.id,
'upvotes': comment.score,
'flair': '',
'date': comment.created_utc,
}
return comment_attributes
def format_path(
self,
resource: Resource,
destination_directory: Path,
index: Optional[int] = None,
) -> Path:
subfolder = Path(
destination_directory,
*[self._format_name(resource.source_submission, part) for part in self.directory_format_string]
)
index = f'_{str(index)}' if index else ''
if not resource.extension:
raise BulkDownloaderException(f'Resource from {resource.url} has no extension')
ending = index + resource.extension
file_name = str(self._format_name(resource.source_submission, self.file_format_string))
file_name = self._limit_file_name_length(file_name, ending)
try:
file_path = Path(subfolder, file_name)
except TypeError:
raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')
return file_path
@staticmethod
def _limit_file_name_length(filename: str, ending: str) -> str:
possible_id = re.search(r'((?:_\w{6})?$)', filename)
if possible_id:
ending = possible_id.group(1) + ending
filename = filename[:possible_id.start()]
max_length_chars = 255 - len(ending)
max_length_bytes = 255 - len(ending.encode('utf-8'))
while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes:
filename = filename[:-1]
return filename + ending
def format_resource_paths(
self,
resources: list[Resource],
destination_directory: Path,
) -> list[tuple[Path, Resource]]:
out = []
if len(resources) == 1:
try:
out.append((self.format_path(resources[0], destination_directory, None), resources[0]))
except BulkDownloaderException as e:
logger.error(f'Could not generate file path for resource {resources[0].url}: {e}')
logger.exception('Could not generate file path')
else:
for i, res in enumerate(resources, start=1):
logger.log(9, f'Formatting filename with index {i}')
try:
out.append((self.format_path(res, destination_directory, i), res))
except BulkDownloaderException as e:
logger.error(f'Could not generate file path for resource {res.url}: {e}')
logger.exception('Could not generate file path')
return out
@staticmethod
def validate_string(test_string: str) -> bool:
if not test_string:
return False
result = any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms])
if result:
if 'POSTID' not in test_string:
logger.warning(
'Some files might not be downloaded due to name conflicts as filenames are'
' not guaranteed to be unique without {POSTID}')
return True
else:
return False
@staticmethod
def _format_for_windows(input_string: str) -> str:
invalid_characters = r'<>:"\/|?*'
for char in invalid_characters:
input_string = input_string.replace(char, '')
input_string = FileNameFormatter._strip_emojis(input_string)
return input_string
@staticmethod
def _strip_emojis(input_string: str) -> str:
result = input_string.encode('ascii', errors='ignore').decode('utf-8')
return result
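A quick sketch of the two pure helpers above (illustrative asserts, not part of the diff):

```python
from bdfr.file_name_formatter import FileNameFormatter

# At least one known key such as {POSTID} must appear in the format string
assert FileNameFormatter.validate_string('{REDDITOR}_{TITLE}_{POSTID}')
assert not FileNameFormatter.validate_string('static_name')

# Over-long names are trimmed while the trailing _POSTID marker and the
# extension are preserved
trimmed = FileNameFormatter._limit_file_name_length('a' * 300 + '_abc123', '.jpg')
assert trimmed.endswith('_abc123.jpg') and len(trimmed) <= 255
```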

107
bdfr/oauth2.py Normal file
View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
# coding=utf-8
import configparser
import logging
import random
import re
import socket
from pathlib import Path
import praw
import requests
from bdfr.exceptions import BulkDownloaderException, RedditAuthenticationError
logger = logging.getLogger(__name__)
class OAuth2Authenticator:
def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str):
self._check_scopes(wanted_scopes)
self.scopes = wanted_scopes
self.client_id = client_id
self.client_secret = client_secret
@staticmethod
def _check_scopes(wanted_scopes: set[str]):
response = requests.get('https://www.reddit.com/api/v1/scopes.json',
headers={'User-Agent': 'fetch-scopes test'})
known_scopes = [scope for scope, data in response.json().items()]
known_scopes.append('*')
for scope in wanted_scopes:
if scope not in known_scopes:
raise BulkDownloaderException(f'Scope {scope} is not known to reddit')
@staticmethod
def split_scopes(scopes: str) -> set[str]:
scopes = re.split(r'[,: ]+', scopes)
return set(scopes)
def retrieve_new_token(self) -> str:
reddit = praw.Reddit(
redirect_uri='http://localhost:7634',
user_agent='obtain_refresh_token for BDFR',
client_id=self.client_id,
client_secret=self.client_secret)
state = str(random.randint(0, 65000))
url = reddit.auth.url(self.scopes, state, 'permanent')
logger.warning('Authentication action required before the program can proceed')
logger.warning(f'Authenticate at {url}')
client = self.receive_connection()
data = client.recv(1024).decode('utf-8')
param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&')
params = {key: value for (key, value) in [token.split('=') for token in param_tokens]}
if state != params['state']:
self.send_message(client)
raise RedditAuthenticationError(f'State mismatch in OAuth2. Expected: {state} Received: {params["state"]}')
elif 'error' in params:
self.send_message(client)
raise RedditAuthenticationError(f'Error in OAuth2: {params["error"]}')
self.send_message(client, "<script>alert('You can go back to the terminal window now.')</script>")
refresh_token = reddit.auth.authorize(params["code"])
return refresh_token
@staticmethod
def receive_connection() -> socket.socket:
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('localhost', 7634))
logger.log(9, 'Server listening on localhost:7634')
server.listen(1)
client = server.accept()[0]
server.close()
logger.log(9, 'Server closed')
return client
@staticmethod
def send_message(client: socket.socket, message: str = ''):
client.send(f'HTTP/1.1 200 OK\r\n\r\n{message}'.encode('utf-8'))
client.close()
class OAuth2TokenManager(praw.reddit.BaseTokenManager):
def __init__(self, config: configparser.ConfigParser, config_location: Path):
super(OAuth2TokenManager, self).__init__()
self.config = config
self.config_location = config_location
def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer):
if authorizer.refresh_token is None:
if self.config.has_option('DEFAULT', 'user_token'):
authorizer.refresh_token = self.config.get('DEFAULT', 'user_token')
logger.log(9, 'Loaded OAuth2 token for authoriser')
else:
raise RedditAuthenticationError('No auth token loaded in configuration')
def post_refresh_callback(self, authorizer: praw.reddit.Authorizer):
self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
with open(self.config_location, 'w') as file:
self.config.write(file, True)
logger.log(9, f'Written OAuth2 token from authoriser to {self.config_location}')
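For reference, the scope splitter above accepts commas, colons, and spaces interchangeably; a minimal illustration:

```python
from bdfr.oauth2 import OAuth2Authenticator

# Commas, colons, and spaces (in any combination) separate scopes
assert OAuth2Authenticator.split_scopes('identity, history:read') == {'identity', 'history', 'read'}
```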

72
bdfr/resource.py Normal file
View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
# coding=utf-8
import hashlib
import logging
import re
import time
from typing import Optional
import urllib.parse
import _hashlib
import requests
from praw.models import Submission
from bdfr.exceptions import BulkDownloaderException
logger = logging.getLogger(__name__)
class Resource:
def __init__(self, source_submission: Submission, url: str, extension: str = None):
self.source_submission = source_submission
self.content: Optional[bytes] = None
self.url = url
self.hash: Optional[_hashlib.HASH] = None
self.extension = extension
if not self.extension:
self.extension = self._determine_extension()
@staticmethod
def retry_download(url: str, max_wait_time: int, wait_time: int = 60) -> Optional[bytes]:
try:
response = requests.get(url)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except requests.exceptions.ConnectionError as e:
logger.warning(f'Error occurred downloading from {url}, waiting {wait_time} seconds: {e}')
time.sleep(wait_time)
if wait_time < max_wait_time:
# escalate the wait on each retry so the loop terminates at max_wait_time
return Resource.retry_download(url, max_wait_time, wait_time + 60)
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
def download(self, max_wait_time: int):
if not self.content:
try:
content = self.retry_download(self.url, max_wait_time)
except requests.exceptions.ConnectionError as e:
raise BulkDownloaderException(f'Could not download resource: {e}')
except BulkDownloaderException:
raise
if content:
self.content = content
if not self.hash and self.content:
self.create_hash()
def create_hash(self):
self.hash = hashlib.md5(self.content)
def _determine_extension(self) -> Optional[str]:
extension_pattern = re.compile(r'.*(\..{3,5})$')
stripped_url = urllib.parse.urlsplit(self.url).path
match = re.search(extension_pattern, stripped_url)
if match:
return match.group(1)
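A minimal usage sketch for Resource (the mock object and the URL are invented for illustration):

```python
from unittest.mock import Mock

from bdfr.resource import Resource

submission = Mock()  # stands in for a praw.models.Submission
resource = Resource(submission, 'https://example.com/media/video.mp4')
assert resource.extension == '.mp4'  # inferred from the URL path
```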

9
bdfr/site_authenticator.py Normal file
View File

@@ -0,0 +1,9 @@
#!/usr/bin/env python3
# coding=utf-8
import configparser
class SiteAuthenticator:
def __init__(self, cfg: configparser.ConfigParser):
self.imgur_authentication = None

33
bdfr/site_downloaders/base_downloader.py Normal file
View File

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
from abc import ABC, abstractmethod
from typing import Optional
import requests
from praw.models import Submission
from bdfr.exceptions import ResourceNotFound
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
logger = logging.getLogger(__name__)
class BaseDownloader(ABC):
def __init__(self, post: Submission, typical_extension: Optional[str] = None):
self.post = post
self.typical_extension = typical_extension
@abstractmethod
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
"""Return list of all un-downloaded Resources from submission"""
raise NotImplementedError
@staticmethod
def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response:
res = requests.get(url, cookies=cookies, headers=headers)
if res.status_code != 200:
raise ResourceNotFound(f'Server responded with {res.status_code} to {url}')
return res

17
bdfr/site_downloaders/direct.py Normal file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env python3
from typing import Optional
from praw.models import Submission
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.resource import Resource
from bdfr.site_downloaders.base_downloader import BaseDownloader
class Direct(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
return [Resource(self.post, self.post.url)]

59
bdfr/site_downloaders/download_factory.py Normal file
View File

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# coding=utf-8
import re
import urllib.parse
from typing import Type
from bdfr.exceptions import NotADownloadableLinkError
from bdfr.site_downloaders.base_downloader import BaseDownloader
from bdfr.site_downloaders.direct import Direct
from bdfr.site_downloaders.erome import Erome
from bdfr.site_downloaders.gallery import Gallery
from bdfr.site_downloaders.gfycat import Gfycat
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
from bdfr.site_downloaders.imgur import Imgur
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.vreddit import VReddit
from bdfr.site_downloaders.youtube import Youtube
class DownloadFactory:
@staticmethod
def pull_lever(url: str) -> Type[BaseDownloader]:
sanitised_url = DownloadFactory._sanitise_url(url)
if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url):
return Imgur
elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url):
return Direct
elif re.match(r'erome\.com.*', sanitised_url):
return Erome
elif re.match(r'reddit\.com/gallery/.*', sanitised_url):
return Gallery
elif re.match(r'gfycat\.', sanitised_url):
return Gfycat
elif re.match(r'gifdeliverynetwork', sanitised_url):
return GifDeliveryNetwork
elif re.match(r'(m\.)?imgur.*', sanitised_url):
return Imgur
elif re.match(r'redgifs\.com', sanitised_url):
return Redgifs
elif re.match(r'reddit\.com/r/', sanitised_url):
return SelfPost
elif re.match(r'v\.redd\.it', sanitised_url):
return VReddit
elif re.match(r'(m\.)?youtu\.?be', sanitised_url):
return Youtube
elif re.match(r'i\.redd\.it.*', sanitised_url):
return Direct
else:
raise NotADownloadableLinkError(f'No downloader module exists for url {url}')
@staticmethod
def _sanitise_url(url: str) -> str:
beginning_regex = re.compile(r'\s*(www\.?)?')
split_url = urllib.parse.urlsplit(url)
split_url = split_url.netloc + split_url.path
split_url = re.sub(beginning_regex, '', split_url)
return split_url
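An illustrative sketch of the factory dispatch (URLs invented; matching follows the regexes above):

```python
from bdfr.site_downloaders.direct import Direct
from bdfr.site_downloaders.download_factory import DownloadFactory
from bdfr.site_downloaders.gfycat import Gfycat

# pull_lever returns a downloader class, not an instance
assert DownloadFactory.pull_lever('https://i.redd.it/example.png') is Direct
assert DownloadFactory.pull_lever('https://gfycat.com/example') is Gfycat
```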

45
bdfr/site_downloaders/erome.py Normal file
View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
import logging
import re
from typing import Optional
import bs4
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__)
class Erome(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
links = self._get_links(self.post.url)
if not links:
raise SiteDownloaderError('Erome parser could not find any links')
out = []
for link in links:
if not re.match(r'https?://.*', link):
link = 'https://' + link
out.append(Resource(self.post, link))
return out
@staticmethod
def _get_links(url: str) -> set[str]:
page = Erome.retrieve_url(url)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
front_images = soup.find_all('img', attrs={'class': 'lasyload'})  # 'lasyload' appears verbatim in Erome's markup
out = [im.get('data-src') for im in front_images]
videos = soup.find_all('source')
out.extend([vid.get('src') for vid in videos])
return set(out)

40
bdfr/site_downloaders/gallery.py Normal file
View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import logging
import re
from typing import Optional
import bs4
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__)
class Gallery(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
image_urls = self._get_links(self.post.url)
if not image_urls:
raise SiteDownloaderError('No images found in Reddit gallery')
return [Resource(self.post, url) for url in image_urls]
@staticmethod
def _get_links(url: str) -> list[str]:
resource_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
page = Gallery.retrieve_url(url, headers=resource_headers)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
links = [link.get('href') for link in links]
return links

41
bdfr/site_downloaders/gfycat.py Normal file
View File

@@ -0,0 +1,41 @@
#!/usr/bin/env python3
import json
import re
from typing import Optional
from bs4 import BeautifulSoup
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
class Gfycat(GifDeliveryNetwork):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
return super().find_resources(authenticator)
@staticmethod
def _get_link(url: str) -> str:
try:
gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1)
except AttributeError:
raise SiteDownloaderError(f'Could not extract Gfycat ID from {url}')
url = 'https://gfycat.com/' + gfycat_id
response = Gfycat.retrieve_url(url)
if 'gifdeliverynetwork' in response.url:
return GifDeliveryNetwork._get_link(url)
soup = BeautifulSoup(response.text, 'html.parser')
content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
try:
out = json.loads(content.contents[0])['video']['contentUrl']
except (IndexError, KeyError) as e:
raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}')
except json.JSONDecodeError as e:
raise SiteDownloaderError(f'Did not receive valid JSON data: {e}')
return out

36
bdfr/site_downloaders/gif_delivery_network.py Normal file
View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
from typing import Optional
from bs4 import BeautifulSoup
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
class GifDeliveryNetwork(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
media_url = self._get_link(self.post.url)
return [Resource(self.post, media_url, '.mp4')]
@staticmethod
def _get_link(url: str) -> str:
page = GifDeliveryNetwork.retrieve_url(url)
soup = BeautifulSoup(page.text, 'html.parser')
content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'})
try:
out = content['src']
if not out:
raise KeyError
except (KeyError, TypeError):
raise SiteDownloaderError('Could not find source link')
return out

79
bdfr/site_downloaders/imgur.py Normal file
View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
import json
import re
from typing import Optional
import bs4
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
class Imgur(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
self.raw_data = {}
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
self.raw_data = self._get_data(self.post.url)
out = []
if 'album_images' in self.raw_data:
images = self.raw_data['album_images']
for image in images['images']:
out.append(self._download_image(image))
else:
out.append(self._download_image(self.raw_data))
return out
def _download_image(self, image: dict) -> Resource:
image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
return Resource(self.post, image_url)
@staticmethod
def _get_data(link: str) -> dict:
if re.match(r'.*\.gifv$', link):
link = link.replace('i.imgur', 'imgur')
link = re.sub(r'\.gifv$', '', link)  # rstrip('.gifv') would also strip those characters from the end of the id
res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})
soup = bs4.BeautifulSoup(res.text, 'html.parser')
scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
scripts = [script.string.replace('\n', '') for script in scripts if script.string]
script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'')
chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts))
if len(chosen_script) != 1:
raise SiteDownloaderError(f'Could not read page source from {link}')
chosen_script = chosen_script[0]
outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);')
inner_regex = re.compile(r'image\s*:(.*),\s*group')
try:
image_dict = re.search(outer_regex, chosen_script).group(1)
image_dict = re.search(inner_regex, image_dict).group(1)
except AttributeError:
raise SiteDownloaderError('Could not find image dictionary in page source')
try:
image_dict = json.loads(image_dict)
except json.JSONDecodeError as e:
raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}')
return image_dict
@staticmethod
def _validate_extension(extension_suffix: str) -> str:
possible_extensions = ('.jpg', '.png', '.mp4', '.gif')
selection = [ext for ext in possible_extensions if ext == extension_suffix]
if len(selection) == 1:
return selection[0]
else:
raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')

52
bdfr/site_downloaders/redgifs.py Normal file
View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
import json
import re
from typing import Optional
from bs4 import BeautifulSoup
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
class Redgifs(GifDeliveryNetwork):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
return super().find_resources(authenticator)
@staticmethod
def _get_link(url: str) -> str:
try:
redgif_id = re.match(r'.*/(.*?)/?$', url).group(1)
except AttributeError:
raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}')
url = 'https://redgifs.com/watch/' + redgif_id
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
}
page = Redgifs.retrieve_url(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
if content is None:
raise SiteDownloaderError('Could not read the page source')
try:
out = json.loads(content.contents[0])['video']['contentUrl']
except (IndexError, KeyError):
raise SiteDownloaderError('Failed to find JSON data in page')
except json.JSONDecodeError as e:
raise SiteDownloaderError(f'Received data was not valid JSON: {e}')
return out

43
bdfr/site_downloaders/self_post.py Normal file
View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import logging
from typing import Optional
from praw.models import Submission
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__)
class SelfPost(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
out = Resource(self.post, self.post.url, '.txt')
out.content = self.export_to_string().encode('utf-8')
out.create_hash()
return [out]
def export_to_string(self) -> str:
"""Self posts are formatted here"""
content = ("## ["
+ self.post.fullname
+ "]("
+ self.post.url
+ ")\n"
+ self.post.selftext
+ "\n\n---\n\n"
+ "submitted to [r/"
+ self.post.subreddit.title
+ "](https://www.reddit.com/r/"
+ self.post.subreddit.title
+ ") by [u/"
+ (self.post.author.name if self.post.author else "DELETED")
+ "](https://www.reddit.com/user/"
+ (self.post.author.name if self.post.author else "DELETED")
+ ")")
return content
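For reference, the markdown produced by export_to_string has roughly this shape (all values invented for illustration):

```
## [t3_abc123](https://redd.it/abc123)
Body of the self post.

---

submitted to [r/Python](https://www.reddit.com/r/Python) by [u/example_user](https://www.reddit.com/user/example_user)
```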

21
bdfr/site_downloaders/vreddit.py Normal file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
import logging
from typing import Optional
from praw.models import Submission
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.youtube import Youtube
logger = logging.getLogger(__name__)
class VReddit(Youtube):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
out = super()._download_video({})
return [out]

50
bdfr/site_downloaders/youtube.py Normal file
View File

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
import logging
import tempfile
from pathlib import Path
from typing import Optional
import youtube_dl
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__)
class Youtube(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
ytdl_options = {
'format': 'best',
'playlistend': 1,
'nooverwrites': True,
}
out = self._download_video(ytdl_options)
return [out]
def _download_video(self, ytdl_options: dict) -> Resource:
ytdl_options['quiet'] = True
with tempfile.TemporaryDirectory() as temp_dir:
download_path = Path(temp_dir).resolve()
ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
try:
with youtube_dl.YoutubeDL(ytdl_options) as ydl:
ydl.download([self.post.url])
except youtube_dl.DownloadError as e:
raise SiteDownloaderError(f'Youtube download failed: {e}')
downloaded_file = list(download_path.iterdir())[0]
extension = downloaded_file.suffix
with open(downloaded_file, 'rb') as file:
content = file.read()
out = Resource(self.post, self.post.url, extension)
out.content = content
out.create_hash()
return out

0
bdfr/tests/__init__.py Normal file
View File

2
bdfr/tests/archive_entry/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
#!/usr/bin/env python3
# coding=utf-8

38
bdfr/tests/archive_entry/test_comment_archive_entry.py Normal file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_comment_id', 'expected_dict'), (
('gstd4hk', {
'author': 'james_pic',
'subreddit': 'Python',
'submission': 'mgi4op',
'submission_title': '76% Faster CPython',
}),
))
def test_get_comment_details(test_comment_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
comment = reddit_instance.comment(id=test_comment_id)
test_entry = CommentArchiveEntry(comment)
result = test_entry.compile()
assert all([result.get(key) == expected_dict[key] for key in expected_dict.keys()])
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_comment_id', 'expected_min_comments'), (
('gstd4hk', 4),
('gsvyste', 3),
('gsxnvvb', 5),
))
def test_get_comment_replies(test_comment_id: str, expected_min_comments: int, reddit_instance: praw.Reddit):
comment = reddit_instance.comment(id=test_comment_id)
test_entry = CommentArchiveEntry(comment)
result = test_entry.compile()
assert len(result.get('replies')) >= expected_min_comments

36
bdfr/tests/archive_entry/test_submission_archive_entry.py Normal file
View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'min_comments'), (
('m3reby', 27),
))
def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit):
test_submission = reddit_instance.submission(id=test_submission_id)
test_archive_entry = SubmissionArchiveEntry(test_submission)
results = test_archive_entry._get_comments()
assert len(results) >= min_comments
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
('m3reby', {
'author': 'sinjen-tos',
'id': 'm3reby',
'link_flair_text': 'image',
}),
('m3kua3', {'author': 'DELETED'}),
))
def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
test_submission = reddit_instance.submission(id=test_submission_id)
test_archive_entry = SubmissionArchiveEntry(test_submission)
test_archive_entry._get_post_details()
assert all([test_archive_entry.post_details.get(key) == expected_dict[key] for key in expected_dict.keys()])

34
bdfr/tests/conftest.py Normal file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
# coding=utf-8
import configparser
import socket
from pathlib import Path
import praw
import pytest
from bdfr.oauth2 import OAuth2TokenManager
@pytest.fixture(scope='session')
def reddit_instance():
rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test')
return rd
@pytest.fixture(scope='session')
def authenticated_reddit_instance():
test_config_path = Path('test_config.cfg')
if not test_config_path.exists():
pytest.skip('Refresh token must be provided to authenticate with OAuth2')
cfg_parser = configparser.ConfigParser()
cfg_parser.read(test_config_path)
if not cfg_parser.has_option('DEFAULT', 'user_token'):
pytest.skip('Refresh token must be provided to authenticate with OAuth2')
token_manager = OAuth2TokenManager(cfg_parser, test_config_path)
reddit_instance = praw.Reddit(client_id=cfg_parser.get('DEFAULT', 'client_id'),
client_secret=cfg_parser.get('DEFAULT', 'client_secret'),
user_agent=socket.gethostname(),
token_manager=token_manager)
return reddit_instance

0
bdfr/tests/site_downloaders/__init__.py Normal file
View File

25
bdfr/tests/site_downloaders/test_direct.py Normal file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import Mock
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.direct import Direct
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4', '48f9bd4dbec1556d7838885612b13b39'),
('https://giant.gfycat.com/DazzlingSilkyIguana.mp4', '808941b48fc1e28713d36dd7ed9dc648'),
))
def test_download_resource(test_url: str, expected_hash: str):
mock_submission = Mock()
mock_submission.url = test_url
test_site = Direct(mock_submission)
resources = test_site.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
resources[0].download(120)
assert resources[0].hash.hexdigest() == expected_hash

71
bdfr/tests/site_downloaders/test_download_factory.py Normal file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bdfr.exceptions import NotADownloadableLinkError
from bdfr.site_downloaders.base_downloader import BaseDownloader
from bdfr.site_downloaders.direct import Direct
from bdfr.site_downloaders.download_factory import DownloadFactory
from bdfr.site_downloaders.erome import Erome
from bdfr.site_downloaders.gallery import Gallery
from bdfr.site_downloaders.gfycat import Gfycat
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
from bdfr.site_downloaders.imgur import Imgur
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.vreddit import VReddit
from bdfr.site_downloaders.youtube import Youtube
@pytest.mark.parametrize(('test_submission_url', 'expected_class'), (
('https://v.redd.it/9z1dnk3xr5k61', VReddit),
('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life'
'_in_anything_but_comfort/', SelfPost),
('https://i.imgur.com/bZx1SJQ.jpg', Direct),
('https://i.redd.it/affyv0axd5k61.png', Direct),
('https://imgur.com/3ls94yv.jpeg', Direct),
('https://i.imgur.com/BuzvZwb.gifv', Imgur),
('https://imgur.com/BuzvZwb.gifv', Imgur),
('https://i.imgur.com/6fNdLst.gif', Direct),
('https://imgur.com/a/MkxAzeg', Imgur),
('https://www.reddit.com/gallery/lu93m7', Gallery),
('https://gfycat.com/concretecheerfulfinwhale', Gfycat),
('https://www.erome.com/a/NWGw0F09', Erome),
('https://youtube.com/watch?v=Gv8Wz74FjVA', Youtube),
('https://redgifs.com/watch/courageousimpeccablecanvasback', Redgifs),
('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', GifDeliveryNetwork),
('https://youtu.be/DevfjHOhuFc', Youtube),
('https://m.youtube.com/watch?v=kr-FeojxzUM', Youtube),
('https://i.imgur.com/3SKrQfK.jpg?1', Direct),
('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct),
('https://m.imgur.com/a/py3RW0j', Imgur),
))
def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit):
result = DownloadFactory.pull_lever(test_submission_url)
assert result is expected_class
@pytest.mark.parametrize('test_url', (
'random.com',
'bad',
'https://www.google.com/',
'https://www.google.com',
'https://www.google.com/test',
'https://www.google.com/test/',
))
def test_factory_lever_bad(test_url: str):
with pytest.raises(NotADownloadableLinkError):
DownloadFactory.pull_lever(test_url)
@pytest.mark.parametrize(('test_url', 'expected'), (
('www.test.com/test.png', 'test.com/test.png'),
('www.test.com/test.png?test_value=random', 'test.com/test.png'),
('https://youtube.com/watch?v=Gv8Wz74FjVA', 'youtube.com/watch'),
('https://i.imgur.com/BuzvZwb.gifv', 'i.imgur.com/BuzvZwb.gifv'),
))
def test_sanitise_url(test_url: str, expected: str):
result = DownloadFactory._sanitise_url(test_url)
assert result == expected

57
bdfr/tests/site_downloaders/test_erome.py Normal file
View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import MagicMock
import pytest
from bdfr.site_downloaders.erome import Erome
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_urls'), (
('https://www.erome.com/a/vqtPuLXh', (
'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
)),
('https://www.erome.com/a/ORhX0FZz', (
'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
)),
))
def test_get_link(test_url: str, expected_urls: tuple[str]):
result = Erome._get_links(test_url)
assert set(result) == set(expected_urls)
@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
('https://www.erome.com/a/vqtPuLXh', {
'5da2a8d60d87bed279431fdec8e7d72f'
}),
('https://www.erome.com/i/ItASD33e', {
'b0d73fedc9ce6995c2f2c4fdb6f11eff'
}),
('https://www.erome.com/a/lGrcFxmb', {
'0e98f9f527a911dcedde4f846bb5b69f',
'25696ae364750a5303fc7d7dc78b35c1',
'63775689f438bd393cde7db6d46187de',
'a1abf398cfd4ef9cfaf093ceb10c746a',
'bd9e1a4ea5ef0d6ba47fb90e337c2d14'
}),
))
def test_download_resource(test_url: str, expected_hashes: tuple[str]):
# Can't compare exact hashes for this test; Erome doesn't return the exact same file
# from request to request, so the hashes change between runs
mock_submission = MagicMock()
mock_submission.url = test_url
test_site = Erome(mock_submission)
resources = test_site.find_resources()
[res.download(120) for res in resources]
resource_hashes = [res.hash.hexdigest() for res in resources]
assert len(resource_hashes) == len(expected_hashes)

60
bdfr/tests/site_downloaders/test_gallery.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bdfr.site_downloaders.gallery import Gallery
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected'), (
('https://www.reddit.com/gallery/m6lvrh', {
'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8',
'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805',
'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76',
'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444'
}),
('https://www.reddit.com/gallery/ljyy27', {
'https://preview.redd.it/04vxj25uqih61.png?width=92&'
'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4',
'https://preview.redd.it/0fnx83kpqih61.png?width=241&'
'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a',
'https://preview.redd.it/7zkmr1wqqih61.png?width=237&'
'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0',
'https://preview.redd.it/u37k5gxrqih61.png?width=443&'
'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862'
}),
))
def test_gallery_get_links(test_url: str, expected: set[str]):
results = Gallery._get_links(test_url)
assert set(results) == expected
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), (
('m6lvrh', {
'6c8a892ae8066cbe119218bcaac731e1',
'93ce177f8cb7994906795f4615114d13',
'9a293adf19354f14582608cf22124574',
'b73e2c3daee02f99404644ea02f1ae65'
}),
('ljyy27', {
'1bc38bed88f9c4770e22a37122d5c941',
'2539a92b78f3968a069df2dffe2279f9',
'37dea50281c219b905e46edeefc1a18d',
'ec4924cf40549728dcf53dd40bc7a73c'
}),
))
def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit):
test_submission = reddit_instance.submission(id=test_submission_id)
gallery = Gallery(test_submission)
results = gallery.find_resources()
[res.download(120) for res in results]
hashes = [res.hash.hexdigest() for res in results]
assert set(hashes) == expected_hashes

36
bdfr/tests/site_downloaders/test_gfycat.py Normal file
View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import Mock
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.gfycat import Gfycat
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_url'), (
('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'),
('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'),
('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'),
))
def test_get_link(test_url: str, expected_url: str):
result = Gfycat._get_link(test_url)
assert result == expected_url
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://gfycat.com/definitivecaninecrayfish', '48f9bd4dbec1556d7838885612b13b39'),
('https://gfycat.com/dazzlingsilkyiguana', '808941b48fc1e28713d36dd7ed9dc648'),
))
def test_download_resource(test_url: str, expected_hash: str):
mock_submission = Mock()
mock_submission.url = test_url
test_site = Gfycat(mock_submission)
resources = test_site.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
resources[0].download(120)
assert resources[0].hash.hexdigest() == expected_hash

37
bdfr/tests/site_downloaders/test_gif_delivery_network.py Normal file
View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import Mock
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected'), (
('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer',
'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'),
('https://www.gifdeliverynetwork.com/maturenexthippopotamus',
'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'),
))
def test_get_link(test_url: str, expected: str):
result = GifDeliveryNetwork._get_link(test_url)
assert result == expected
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'),
('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'),
))
def test_download_resource(test_url: str, expected_hash: str):
mock_submission = Mock()
mock_submission.url = test_url
test_site = GifDeliveryNetwork(mock_submission)
resources = test_site.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
resources[0].download(120)
assert resources[0].hash.hexdigest() == expected_hash

135
bdfr/tests/site_downloaders/test_imgur.py Normal file
View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import Mock
import pytest
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_downloaders.imgur import Imgur
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_gen_dict', 'expected_image_dict'), (
(
'https://imgur.com/a/xWZsDDP',
{'num_images': '1', 'id': 'xWZsDDP', 'hash': 'xWZsDDP'},
[
{'hash': 'ypa8YfS', 'title': '', 'ext': '.png', 'animated': False}
]
),
(
'https://imgur.com/gallery/IjJJdlC',
{'num_images': 1, 'id': 384898055, 'hash': 'IjJJdlC'},
[
{'hash': 'CbbScDt',
'description': 'watch when he gets it',
'ext': '.gif',
'animated': True,
'has_sound': False
}
],
),
(
'https://imgur.com/a/dcc84Gt',
{'num_images': '4', 'id': 'dcc84Gt', 'hash': 'dcc84Gt'},
[
{'hash': 'ylx0Kle', 'ext': '.jpg', 'title': ''},
{'hash': 'TdYfKbK', 'ext': '.jpg', 'title': ''},
{'hash': 'pCxGbe8', 'ext': '.jpg', 'title': ''},
{'hash': 'TSAkikk', 'ext': '.jpg', 'title': ''},
]
),
(
'https://m.imgur.com/a/py3RW0j',
{'num_images': '1', 'id': 'py3RW0j', 'hash': 'py3RW0j', },
[
{'hash': 'K24eQmK', 'has_sound': False, 'ext': '.jpg'}
],
),
))
def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_dict: list[dict]):
result = Imgur._get_data(test_url)
assert all([result.get(key) == expected_gen_dict[key] for key in expected_gen_dict.keys()])
# Check if all the keys from the test dict are correct in at least one of the album entries
assert any([all([image.get(key) == image_dict[key] for key in image_dict.keys()])
for image_dict in expected_image_dict for image in result['album_images']['images']])
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_image_dict'), (
(
'https://i.imgur.com/dLk3FGY.gifv',
{'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True}
),
(
'https://imgur.com/BuzvZwb.gifv',
{
'hash': 'BuzvZwb',
'title': '',
'description': 'Akron Glass Works',
'animated': True,
'mimetype': 'video/mp4'
},
),
))
def test_get_data_gif(test_url: str, expected_image_dict: dict):
result = Imgur._get_data(test_url)
assert all([result.get(key) == expected_image_dict[key] for key in expected_image_dict.keys()])
@pytest.mark.parametrize('test_extension', (
'.gif',
'.png',
'.jpg',
'.mp4'
))
def test_imgur_extension_validation_good(test_extension: str):
result = Imgur._validate_extension(test_extension)
assert result == test_extension
@pytest.mark.parametrize('test_extension', (
'.jpeg',
'bad',
'.avi',
'.test',
'.flac',
))
def test_imgur_extension_validation_bad(test_extension: str):
with pytest.raises(SiteDownloaderError):
Imgur._validate_extension(test_extension)
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
(
'https://imgur.com/a/xWZsDDP',
('f551d6e6b0fef2ce909767338612e31b',)
),
(
'https://imgur.com/gallery/IjJJdlC',
('7227d4312a9779b74302724a0cfa9081',),
),
(
'https://imgur.com/a/dcc84Gt',
(
'cf1158e1de5c3c8993461383b96610cf',
'28d6b791a2daef8aa363bf5a3198535d',
'248ef8f2a6d03eeb2a80d0123dbaf9b6',
'029c475ce01b58fdf1269d8771d33913',
),
),
))
def test_find_resources(test_url: str, expected_hashes: list[str]):
mock_download = Mock()
mock_download.url = test_url
downloader = Imgur(mock_download)
results = downloader.find_resources()
assert all([isinstance(res, Resource) for res in results])
[res.download(120) for res in results]
hashes = set([res.hash.hexdigest() for res in results])
assert len(results) == len(expected_hashes)
assert hashes == set(expected_hashes)

37
bdfr/tests/site_downloaders/test_redgifs.py Normal file
View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import Mock
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.redgifs import Redgifs
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected'), (
('https://redgifs.com/watch/frighteningvictorioussalamander',
'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'),
('https://redgifs.com/watch/springgreendecisivetaruca',
'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'),
))
def test_get_link(test_url: str, expected: str):
result = Redgifs._get_link(test_url)
assert result == expected
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'),
('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'),
))
def test_download_resource(test_url: str, expected_hash: str):
mock_submission = Mock()
mock_submission.url = test_url
test_site = Redgifs(mock_submission)
resources = test_site.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
resources[0].download(120)
assert resources[0].hash.hexdigest() == expected_hash

24
bdfr/tests/site_downloaders/test_self_post.py Normal file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.self_post import SelfPost
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_hash'), (
('ltmivt', '7d2c9e4e989e5cf2dca2e55a06b1c4f6'),
('ltoaan', '221606386b614d6780c2585a59bd333f'),
('d3sc8o', 'c1ff2b6bd3f6b91381dcd18dfc4ca35f'),
))
def test_find_resource(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit):
submission = reddit_instance.submission(id=test_submission_id)
downloader = SelfPost(submission)
results = downloader.find_resources()
assert len(results) == 1
assert isinstance(results[0], Resource)
assert results[0].hash.hexdigest() == expected_hash

23
bdfr/tests/site_downloaders/test_vreddit.py Normal file
View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# coding=utf-8
import praw
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.vreddit import VReddit
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id'), (
('lu8l8g'),
))
def test_find_resources(test_submission_id: str, reddit_instance: praw.Reddit):
test_submission = reddit_instance.submission(id=test_submission_id)
downloader = VReddit(test_submission)
resources = downloader.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
resources[0].download(120)
assert resources[0].content is not None

26
bdfr/tests/site_downloaders/test_youtube.py Normal file
View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import MagicMock
import pytest
from bdfr.resource import Resource
from bdfr.site_downloaders.youtube import Youtube
@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'),
('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'),
))
def test_find_resources(test_url: str, expected_hash: str):
test_submission = MagicMock()
test_submission.url = test_url
downloader = Youtube(test_submission)
resources = downloader.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
resources[0].download(120)
assert resources[0].hash.hexdigest() == expected_hash

57
bdfr/tests/test_archiver.py Normal file
View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# coding=utf-8
from pathlib import Path
from unittest.mock import MagicMock
import praw
import pytest
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
from bdfr.archiver import Archiver
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
'm3reby',
))
def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
archiver_mock = MagicMock()
test_path = Path(tmp_path, 'test.json')
test_submission = reddit_instance.submission(id=test_submission_id)
archiver_mock.file_name_formatter.format_path.return_value = test_path
test_entry = SubmissionArchiveEntry(test_submission)
Archiver._write_entry_json(archiver_mock, test_entry)
archiver_mock._write_content_to_disk.assert_called_once()
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
'm3reby',
))
def test_write_submission_xml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
archiver_mock = MagicMock()
test_path = Path(tmp_path, 'test.xml')
test_submission = reddit_instance.submission(id=test_submission_id)
archiver_mock.file_name_formatter.format_path.return_value = test_path
test_entry = SubmissionArchiveEntry(test_submission)
Archiver._write_entry_xml(archiver_mock, test_entry)
archiver_mock._write_content_to_disk.assert_called_once()
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
'm3reby',
))
def test_write_submission_yaml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit):
archiver_mock = MagicMock()
archiver_mock.download_directory = tmp_path
test_path = Path(tmp_path, 'test.yaml')
test_submission = reddit_instance.submission(id=test_submission_id)
archiver_mock.file_name_formatter.format_path.return_value = test_path
test_entry = SubmissionArchiveEntry(test_submission)
Archiver._write_entry_yaml(archiver_mock, test_entry)
archiver_mock._write_content_to_disk.assert_called_once()

24
bdfr/tests/test_configuration.py Normal file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import MagicMock
import pytest
from bdfr.configuration import Configuration
@pytest.mark.parametrize('arg_dict', (
{'directory': 'test_dir'},
{
'directory': 'test_dir',
'no_dupes': True,
},
))
def test_process_click_context(arg_dict: dict):
test_config = Configuration()
test_context = MagicMock()
test_context.params = arg_dict
test_config.process_click_arguments(test_context)
test_config = vars(test_config)
assert all([test_config[arg] == arg_dict[arg] for arg in arg_dict.keys()])

58
bdfr/tests/test_download_filter.py Normal file
View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# coding=utf-8
import pytest
from bdfr.download_filter import DownloadFilter
@pytest.fixture()
def download_filter() -> DownloadFilter:
return DownloadFilter(['mp4', 'mp3'], ['test.com', 'reddit.com'])
@pytest.mark.parametrize(('test_url', 'expected'), (
('test.mp4', False),
('test.avi', True),
('test.random.mp3', False),
))
def test_filter_extension(test_url: str, expected: bool, download_filter: DownloadFilter):
result = download_filter._check_extension(test_url)
assert result == expected
@pytest.mark.parametrize(('test_url', 'expected'), (
('test.mp4', True),
('http://reddit.com/test.mp4', False),
('http://reddit.com/test.gif', False),
('https://www.example.com/test.mp4', True),
('https://www.example.com/test.png', True),
))
def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadFilter):
result = download_filter._check_domain(test_url)
assert result == expected
@pytest.mark.parametrize(('test_url', 'expected'), (
('test.mp4', False),
('test.gif', True),
('https://www.example.com/test.mp4', False),
('https://www.example.com/test.png', True),
('http://reddit.com/test.mp4', False),
('http://reddit.com/test.gif', False),
))
def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter):
result = download_filter.check_url(test_url)
assert result == expected
@pytest.mark.parametrize('test_url', (
'test.mp3',
'test.mp4',
'http://reddit.com/test.mp4',
't',
))
def test_filter_empty_filter(test_url: str):
download_filter = DownloadFilter()
result = download_filter.check_url(test_url)
assert result is True

View File

@@ -0,0 +1,460 @@
#!/usr/bin/env python3
# coding=utf-8
import re
from pathlib import Path
from typing import Iterator
from unittest.mock import MagicMock
import praw
import praw.models
import pytest
from bdfr.__main__ import setup_logging
from bdfr.configuration import Configuration
from bdfr.download_filter import DownloadFilter
from bdfr.downloader import RedditDownloader, RedditTypes
from bdfr.exceptions import BulkDownloaderException
from bdfr.file_name_formatter import FileNameFormatter
from bdfr.site_authenticator import SiteAuthenticator
@pytest.fixture()
def args() -> Configuration:
args = Configuration()
return args
@pytest.fixture()
def downloader_mock(args: Configuration):
downloader_mock = MagicMock()
downloader_mock.args = args
downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
downloader_mock._split_args_input = RedditDownloader._split_args_input
downloader_mock.master_hash_list = {}
return downloader_mock
def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]):
results = [sub for res in results for sub in res]
assert all([isinstance(res, praw.models.Submission) for res in results])
if result_limit is not None:
assert len(results) == result_limit
return results
def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock):
downloader_mock.args.directory = tmp_path / 'test'
downloader_mock.config_directories.user_config_dir = tmp_path
RedditDownloader._determine_directories(downloader_mock)
assert Path(tmp_path / 'test').exists()
@pytest.mark.parametrize(('skip_extensions', 'skip_domains'), (
([], []),
(['.test'], ['test.com'],),
))
def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock):
downloader_mock.args.skip = skip_extensions
downloader_mock.args.skip_domain = skip_domains
result = RedditDownloader._create_download_filter(downloader_mock)
assert isinstance(result, DownloadFilter)
assert result.excluded_domains == skip_domains
assert result.excluded_extensions == skip_extensions
@pytest.mark.parametrize(('test_time', 'expected'), (
('all', 'all'),
('hour', 'hour'),
('day', 'day'),
('week', 'week'),
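    # unrecognised or empty values should fall back to the 'all' time filter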
('random', 'all'),
('', 'all'),
))
def test_create_time_filter(test_time: str, expected: str, downloader_mock: MagicMock):
downloader_mock.args.time = test_time
result = RedditDownloader._create_time_filter(downloader_mock)
assert isinstance(result, RedditTypes.TimeType)
assert result.name.lower() == expected
@pytest.mark.parametrize(('test_sort', 'expected'), (
('', 'hot'),
('hot', 'hot'),
('controversial', 'controversial'),
('new', 'new'),
))
def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: MagicMock):
downloader_mock.args.sort = test_sort
result = RedditDownloader._create_sort_filter(downloader_mock)
assert isinstance(result, RedditTypes.SortType)
assert result.name.lower() == expected
@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), (
('{POSTID}', '{SUBREDDIT}'),
('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}'),
('{POSTID}', 'test'),
('{POSTID}', ''),
('{POSTID}', '{SUBREDDIT}/{REDDITOR}'),
))
def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock):
downloader_mock.args.file_scheme = test_file_scheme
downloader_mock.args.folder_scheme = test_folder_scheme
result = RedditDownloader._create_file_name_formatter(downloader_mock)
assert isinstance(result, FileNameFormatter)
assert result.file_format_string == test_file_scheme
assert result.directory_format_string == test_folder_scheme.split('/')
@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), (
('', ''),
('', '{SUBREDDIT}'),
('test', '{SUBREDDIT}'),
))
def test_create_file_name_formatter_bad(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock):
downloader_mock.args.file_scheme = test_file_scheme
downloader_mock.args.folder_scheme = test_folder_scheme
with pytest.raises(BulkDownloaderException):
RedditDownloader._create_file_name_formatter(downloader_mock)
def test_create_authenticator(downloader_mock: MagicMock):
result = RedditDownloader._create_authenticator(downloader_mock)
assert isinstance(result, SiteAuthenticator)
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_ids', (
('lvpf4l',),
('lvpf4l', 'lvqnsn'),
('lvpf4l', 'lvqnsn', 'lvl9kd'),
))
def test_get_submissions_from_link(
test_submission_ids: list[str],
reddit_instance: praw.Reddit,
downloader_mock: MagicMock):
downloader_mock.args.link = test_submission_ids
downloader_mock.reddit_instance = reddit_instance
results = RedditDownloader._get_submissions_from_link(downloader_mock)
assert all([isinstance(sub, praw.models.Submission) for res in results for sub in res])
assert len(results[0]) == len(test_submission_ids)
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_subreddits', 'limit', 'sort_type', 'time_filter', 'max_expected_len'), (
(('Futurology',), 10, 'hot', 'all', 10),
(('Futurology', 'Mindustry, Python'), 10, 'hot', 'all', 30),
(('Futurology',), 20, 'hot', 'all', 20),
(('Futurology', 'Python'), 10, 'hot', 'all', 20),
(('Futurology',), 100, 'hot', 'all', 100),
(('Futurology',), 0, 'hot', 'all', 0),
(('Futurology',), 10, 'top', 'all', 10),
(('Futurology',), 10, 'top', 'week', 10),
(('Futurology',), 10, 'hot', 'week', 10),
))
def test_get_subreddit_normal(
test_subreddits: list[str],
limit: int,
sort_type: str,
time_filter: str,
max_expected_len: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
):
downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
downloader_mock.args.limit = limit
downloader_mock.args.sort = sort_type
downloader_mock.args.subreddit = test_subreddits
downloader_mock.reddit_instance = reddit_instance
downloader_mock.sort_filter = RedditDownloader._create_sort_filter(downloader_mock)
results = RedditDownloader._get_subreddits(downloader_mock)
test_subreddits = downloader_mock._split_args_input(test_subreddits)
results = [sub for res1 in results for sub in res1]
assert all([isinstance(res1, praw.models.Submission) for res1 in results])
assert all([res.subreddit.display_name in test_subreddits for res in results])
assert len(results) <= max_expected_len
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), (
(('Python',), 'scraper', 10, 'all', 10),
(('Python',), '', 10, 'all', 10),
(('Python',), 'djsdsgewef', 10, 'all', 0),
(('Python',), 'scraper', 10, 'year', 10),
(('Python',), 'scraper', 10, 'hour', 1),
))
def test_get_subreddit_search(
test_subreddits: list[str],
search_term: str,
time_filter: str,
limit: int,
max_expected_len: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
):
downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
downloader_mock.args.limit = limit
downloader_mock.args.search = search_term
downloader_mock.args.subreddit = test_subreddits
downloader_mock.reddit_instance = reddit_instance
downloader_mock.sort_filter = RedditTypes.SortType.HOT
downloader_mock.args.time = time_filter
downloader_mock.time_filter = RedditDownloader._create_time_filter(downloader_mock)
results = RedditDownloader._get_subreddits(downloader_mock)
results = [sub for res in results for sub in res]
assert all([isinstance(res, praw.models.Submission) for res in results])
assert all([res.subreddit.display_name in test_subreddits for res in results])
assert len(results) <= max_expected_len
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_user', 'test_multireddits', 'limit'), (
('helen_darten', ('cuteanimalpics',), 10),
('korfor', ('chess',), 100),
))
# Good sources at https://www.reddit.com/r/multihub/
def test_get_multireddits_public(
test_user: str,
test_multireddits: list[str],
limit: int,
reddit_instance: praw.Reddit,
downloader_mock: MagicMock,
):
downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
downloader_mock.sort_filter = RedditTypes.SortType.HOT
downloader_mock.args.limit = limit
downloader_mock.args.multireddit = test_multireddits
downloader_mock.args.user = test_user
downloader_mock.reddit_instance = reddit_instance
downloader_mock._create_filtered_listing_generator.return_value = \
RedditDownloader._create_filtered_listing_generator(
downloader_mock,
reddit_instance.multireddit(test_user, test_multireddits[0]),
)
results = RedditDownloader._get_multireddits(downloader_mock)
results = [sub for res in results for sub in res]
assert all([isinstance(res, praw.models.Submission) for res in results])
assert len(results) == limit
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_user', 'limit'), (
('danigirl3694', 10),
('danigirl3694', 50),
('CapitanHam', None),
))
def test_get_user_submissions(test_user: str, limit: int, downloader_mock: MagicMock, reddit_instance: praw.Reddit):
downloader_mock.args.limit = limit
downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
downloader_mock.sort_filter = RedditTypes.SortType.HOT
downloader_mock.args.submitted = True
downloader_mock.args.user = test_user
downloader_mock.authenticated = False
downloader_mock.reddit_instance = reddit_instance
downloader_mock._create_filtered_listing_generator.return_value = \
RedditDownloader._create_filtered_listing_generator(
downloader_mock,
reddit_instance.redditor(test_user).submissions,
)
results = RedditDownloader._get_user_data(downloader_mock)
results = assert_all_results_are_submissions(limit, results)
assert all([res.author.name == test_user for res in results])
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.authenticated
@pytest.mark.parametrize('test_flag', (
'upvoted',
'saved',
))
def test_get_user_authenticated_lists(
test_flag: str,
downloader_mock: MagicMock,
authenticated_reddit_instance: praw.Reddit,
):
downloader_mock.args.__dict__[test_flag] = True
downloader_mock.reddit_instance = authenticated_reddit_instance
downloader_mock.args.user = 'me'
downloader_mock.args.limit = 10
downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
downloader_mock.sort_filter = RedditTypes.SortType.HOT
RedditDownloader._resolve_user_name(downloader_mock)
results = RedditDownloader._get_user_data(downloader_mock)
assert_all_results_are_submissions(10, results)
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_files_len'), (
('ljyy27', 4),
))
def test_download_submission(
test_submission_id: str,
expected_files_len: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path):
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
RedditDownloader._download_submission(downloader_mock, submission)
folder_contents = list(tmp_path.iterdir())
assert len(folder_contents) == expected_files_len
@pytest.mark.online
@pytest.mark.reddit
def test_download_submission_file_exists(
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path,
capsys: pytest.CaptureFixture
):
setup_logging(3)
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch()
RedditDownloader._download_submission(downloader_mock, submission)
folder_contents = list(tmp_path.iterdir())
output = capsys.readouterr()
assert len(folder_contents) == 1
assert 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png already exists' in output.out
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'test_hash'), (
('m1hqw6', 'a912af8905ae468e0121e9940f797ad7'),
))
def test_download_submission_hash_exists(
test_submission_id: str,
test_hash: str,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path,
capsys: pytest.CaptureFixture
):
setup_logging(3)
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.args.no_dupes = True
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
downloader_mock.master_hash_list = {test_hash: None}
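    # with the hash pre-seeded above, the downloader should treat the file as a duplicate and remove it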
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
RedditDownloader._download_submission(downloader_mock, submission)
folder_contents = list(tmp_path.iterdir())
output = capsys.readouterr()
assert len(folder_contents) == 0
assert re.search(r'Resource hash .*? downloaded elsewhere', output.out)
@pytest.mark.parametrize(('test_name', 'expected'), (
('Mindustry', 'Mindustry'),
('Futurology', 'Futurology'),
('r/Mindustry', 'Mindustry'),
('TrollXChromosomes', 'TrollXChromosomes'),
('r/TrollXChromosomes', 'TrollXChromosomes'),
('https://www.reddit.com/r/TrollXChromosomes/', 'TrollXChromosomes'),
('https://www.reddit.com/r/TrollXChromosomes', 'TrollXChromosomes'),
('https://www.reddit.com/r/Futurology/', 'Futurology'),
('https://www.reddit.com/r/Futurology', 'Futurology'),
))
def test_sanitise_subreddit_name(test_name: str, expected: str):
result = RedditDownloader._sanitise_subreddit_name(test_name)
assert result == expected
def test_search_existing_files():
results = RedditDownloader.scan_existing_files(Path('.'))
assert len(results.keys()) >= 40
@pytest.mark.parametrize(('test_subreddit_entries', 'expected'), (
(['test1', 'test2', 'test3'], {'test1', 'test2', 'test3'}),
(['test1,test2', 'test3'], {'test1', 'test2', 'test3'}),
(['test1, test2', 'test3'], {'test1', 'test2', 'test3'}),
(['test1; test2', 'test3'], {'test1', 'test2', 'test3'}),
(['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'})
))
def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
results = RedditDownloader._split_args_input(test_subreddit_entries)
assert results == expected
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_submission_id', (
'm1hqw6',
))
def test_mark_hard_link(
test_submission_id: str,
downloader_mock: MagicMock,
tmp_path: Path,
reddit_instance: praw.Reddit
):
downloader_mock.reddit_instance = reddit_instance
downloader_mock.args.make_hard_links = True
downloader_mock.download_directory = tmp_path
downloader_mock.args.folder_scheme = ''
downloader_mock.args.file_scheme = '{POSTID}'
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
original = Path(tmp_path, f'{test_submission_id}.png')
RedditDownloader._download_submission(downloader_mock, submission)
assert original.exists()
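    # downloading again under a different scheme should hard-link to the original file rather than re-download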
downloader_mock.args.file_scheme = 'test2_{POSTID}'
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
RedditDownloader._download_submission(downloader_mock, submission)
test_file_1_stats = original.stat()
test_file_2_inode = Path(tmp_path, f'test2_{test_submission_id}.png').stat().st_ino
assert test_file_1_stats.st_nlink == 2
assert test_file_1_stats.st_ino == test_file_2_inode
@pytest.mark.parametrize(('test_ids', 'test_excluded', 'expected_len'), (
(('aaaaaa',), (), 1),
(('aaaaaa',), ('aaaaaa',), 0),
((), ('aaaaaa',), 0),
(('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
))
def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
downloader_mock.excluded_submission_ids = test_excluded
test_submissions = []
for test_id in test_ids:
m = MagicMock()
m.id = test_id
test_submissions.append(m)
downloader_mock.reddit_lists = [test_submissions]
RedditDownloader.download(downloader_mock)
assert downloader_mock._download_submission.call_count == expected_len
def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
test_file = tmp_path / 'test.txt'
test_file.write_text('aaaaaa\nbbbbbb')
downloader_mock.args.exclude_id_file = [test_file]
results = RedditDownloader._read_excluded_ids(downloader_mock)
assert results == {'aaaaaa', 'bbbbbb'}

View File

@@ -0,0 +1,320 @@
#!/usr/bin/env python3
# coding=utf-8
from pathlib import Path
from typing import Optional
from unittest.mock import MagicMock
import praw.models
import pytest
from bdfr.file_name_formatter import FileNameFormatter
from bdfr.resource import Resource
@pytest.fixture()
def submission() -> MagicMock:
test = MagicMock()
test.title = 'name'
test.subreddit.display_name = 'randomreddit'
test.author.name = 'person'
test.id = '12345'
test.score = 1000
test.link_flair_text = 'test_flair'
test.created_utc = 123456789
test.__class__ = praw.models.Submission
return test
@pytest.fixture(scope='session')
def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission:
return reddit_instance.submission(id='lgilgt')
@pytest.mark.parametrize(('format_string', 'expected'), (
('{SUBREDDIT}', 'randomreddit'),
('{REDDITOR}', 'person'),
('{POSTID}', '12345'),
('{UPVOTES}', '1000'),
('{FLAIR}', 'test_flair'),
('{DATE}', '123456789'),
('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345'),
('{RANDOM}', '{RANDOM}'),
))
def test_format_name_mock(format_string: str, expected: str, submission: MagicMock):
result = FileNameFormatter._format_name(submission, format_string)
assert result == expected
@pytest.mark.parametrize(('test_string', 'expected'), (
('', False),
('test', False),
('{POSTID}', True),
('POSTID', False),
('{POSTID}_test', True),
('test_{TITLE}', True),
('TITLE_POSTID', False),
))
def test_check_format_string_validity(test_string: str, expected: bool):
result = FileNameFormatter.validate_string(test_string)
assert result == expected
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('format_string', 'expected'), (
('{SUBREDDIT}', 'Mindustry'),
('{REDDITOR}', 'Gamer_player_boi'),
('{POSTID}', 'lgilgt'),
('{FLAIR}', 'Art'),
('{SUBREDDIT}_{TITLE}', 'Mindustry_Toxopid that is NOT humane >:('),
('{REDDITOR}_{TITLE}_{POSTID}', 'Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt')
))
def test_format_name_real(format_string: str, expected: str, reddit_submission: praw.models.Submission):
result = FileNameFormatter._format_name(reddit_submission, format_string)
assert result == expected
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'expected'), (
(
'{SUBREDDIT}',
'{POSTID}',
'test/Mindustry/lgilgt.png',
),
(
'{SUBREDDIT}',
'{TITLE}_{POSTID}',
'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt.png',
),
(
'{SUBREDDIT}',
'{REDDITOR}_{TITLE}_{POSTID}',
'test/Mindustry/Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt.png',
),
))
def test_format_full(
format_string_directory: str,
format_string_file: str,
expected: str,
reddit_submission: praw.models.Submission):
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
test_formatter = FileNameFormatter(format_string_file, format_string_directory)
result = test_formatter.format_path(test_resource, Path('test'))
assert str(result) == expected
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('format_string_directory', 'format_string_file'), (
('{SUBREDDIT}', '{POSTID}'),
('{SUBREDDIT}', '{UPVOTES}'),
('{SUBREDDIT}', '{UPVOTES}{POSTID}'),
))
def test_format_full_conform(
format_string_directory: str,
format_string_file: str,
reddit_submission: praw.models.Submission):
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
test_formatter = FileNameFormatter(format_string_file, format_string_directory)
test_formatter.format_path(test_resource, Path('test'))
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'index', 'expected'), (
('{SUBREDDIT}', '{POSTID}', None, 'test/Mindustry/lgilgt.png'),
('{SUBREDDIT}', '{POSTID}', 1, 'test/Mindustry/lgilgt_1.png'),
('{SUBREDDIT}', '{POSTID}', 2, 'test/Mindustry/lgilgt_2.png'),
('{SUBREDDIT}', '{TITLE}_{POSTID}', 2, 'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt_2.png'),
))
def test_format_full_with_index_suffix(
format_string_directory: str,
format_string_file: str,
index: Optional[int],
expected: str,
reddit_submission: praw.models.Submission,
):
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
test_formatter = FileNameFormatter(format_string_file, format_string_directory)
result = test_formatter.format_path(test_resource, Path('test'), index)
assert str(result) == expected
def test_format_multiple_resources():
mocks = []
for i in range(1, 5):
new_mock = MagicMock()
new_mock.url = 'https://example.com/test.png'
new_mock.extension = '.png'
new_mock.source_submission.title = 'test'
new_mock.source_submission.__class__ = praw.models.Submission
mocks.append(new_mock)
test_formatter = FileNameFormatter('{TITLE}', '')
results = test_formatter.format_resource_paths(mocks, Path('.'))
results = set([str(res[0]) for res in results])
assert results == {'test_1.png', 'test_2.png', 'test_3.png', 'test_4.png'}
@pytest.mark.parametrize(('test_filename', 'test_ending'), (
('A' * 300, '.png'),
('A' * 300, '_1.png'),
('a' * 300, '_1000.jpeg'),
('😍💕✨' * 100, '_1.png'),
))
def test_limit_filename_length(test_filename: str, test_ending: str):
result = FileNameFormatter._limit_file_name_length(test_filename, test_ending)
assert len(result) <= 255
assert len(result.encode('utf-8')) <= 255
assert isinstance(result, str)
@pytest.mark.parametrize(('test_filename', 'test_ending', 'expected_end'), (
('test_aaaaaa', '_1.png', 'test_aaaaaa_1.png'),
('test_aataaa', '_1.png', 'test_aataaa_1.png'),
('test_abcdef', '_1.png', 'test_abcdef_1.png'),
('test_aaaaaa', '.png', 'test_aaaaaa.png'),
('test', '_1.png', 'test_1.png'),
('test_m1hqw6', '_1.png', 'test_m1hqw6_1.png'),
('A' * 300 + '_bbbccc', '.png', '_bbbccc.png'),
('A' * 300 + '_bbbccc', '_1000.jpeg', '_bbbccc_1000.jpeg'),
('😍💕✨' * 100 + '_aaa1aa', '_1.png', '_aaa1aa_1.png'),
))
def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str, expected_end: str):
result = FileNameFormatter._limit_file_name_length(test_filename, test_ending)
assert len(result) <= 255
assert len(result.encode('utf-8')) <= 255
assert isinstance(result, str)
assert result.endswith(expected_end)
def test_shorten_filenames(submission: MagicMock, tmp_path: Path):
submission.title = 'A' * 300
submission.author.name = 'test'
submission.subreddit.display_name = 'test'
submission.id = 'BBBBBB'
test_resource = Resource(submission, 'www.example.com/empty', '.jpeg')
test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
result = test_formatter.format_path(test_resource, tmp_path)
result.parent.mkdir(parents=True)
result.touch()
@pytest.mark.parametrize(('test_string', 'expected'), (
('test', 'test'),
('test😍', 'test'),
('test.png', 'test.png'),
('test*', 'test'),
('test**', 'test'),
('test?*', 'test'),
('test_???.png', 'test_.png'),
('test_???😍.png', 'test_.png'),
))
def test_format_file_name_for_windows(test_string: str, expected: str):
result = FileNameFormatter._format_for_windows(test_string)
assert result == expected
@pytest.mark.parametrize(('test_string', 'expected'), (
('test', 'test'),
('test😍', 'test'),
('😍', ''),
))
def test_strip_emojies(test_string: str, expected: str):
result = FileNameFormatter._strip_emojis(test_string)
assert result == expected
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected'), (
('mfuteh', {
'title': 'Why Do Interviewers Ask Linked List Questions?',
'redditor': 'mjgardner',
}),
))
def test_generate_dict_for_submission(test_submission_id: str, expected: dict, reddit_instance: praw.Reddit):
test_submission = reddit_instance.submission(id=test_submission_id)
result = FileNameFormatter._generate_name_dict_from_submission(test_submission)
assert all([result.get(key) == expected[key] for key in expected.keys()])
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_comment_id', 'expected'), (
('gsq0yuw', {
'title': 'Why Do Interviewers Ask Linked List Questions?',
'redditor': 'Doctor-Dapper',
'postid': 'gsq0yuw',
'flair': '',
}),
))
def test_generate_dict_for_comment(test_comment_id: str, expected: dict, reddit_instance: praw.Reddit):
test_comment = reddit_instance.comment(id=test_comment_id)
result = FileNameFormatter._generate_name_dict_from_comment(test_comment)
assert all([result.get(key) == expected[key] for key in expected.keys()])
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme', 'test_comment_id', 'expected_name'), (
('{POSTID}', '', 'gsoubde', 'gsoubde.json'),
('{REDDITOR}_{POSTID}', '', 'gsoubde', 'DELETED_gsoubde.json'),
))
def test_format_archive_entry_comment(
test_file_scheme: str,
test_folder_scheme: str,
test_comment_id: str,
expected_name: str,
tmp_path: Path,
reddit_instance: praw.Reddit,
):
test_comment = reddit_instance.comment(id=test_comment_id)
test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme)
test_entry = Resource(test_comment, '', '.json')
result = test_formatter.format_path(test_entry, tmp_path)
assert result.name == expected_name
@pytest.mark.parametrize(('test_folder_scheme', 'expected'), (
('{REDDITOR}/{SUBREDDIT}', 'person/randomreddit'),
('{POSTID}/{SUBREDDIT}/{REDDITOR}', '12345/randomreddit/person'),
))
def test_multilevel_folder_scheme(
test_folder_scheme: str,
expected: str,
tmp_path: Path,
submission: MagicMock,
):
test_formatter = FileNameFormatter('{POSTID}', test_folder_scheme)
test_resource = MagicMock()
test_resource.source_submission = submission
test_resource.extension = '.png'
result = test_formatter.format_path(test_resource, tmp_path)
result = result.relative_to(tmp_path)
assert str(result.parent) == expected
assert len(result.parents) == (len(expected.split('/')) + 1)
@pytest.mark.parametrize(('test_name_string', 'expected'), (
('test', 'test'),
('😍', '😍'),
('test😍', 'test😍'),
('test😍 ', 'test😍 '),
('test😍 \\u2019', 'test😍 '),
('Using that real good [1\\4]', 'Using that real good [1\\4]'),
))
def test_preserve_emojis(test_name_string: str, expected: str, submission: MagicMock):
submission.title = test_name_string
result = FileNameFormatter._format_name(submission, '{TITLE}')
assert result == expected
@pytest.mark.parametrize(('test_string', 'expected'), (
('test \\u2019', 'test '),
('My cat\\u2019s paws are so cute', 'My cats paws are so cute'),
))
def test_convert_unicode_escapes(test_string: str, expected: str):
result = FileNameFormatter._convert_unicode_escapes(test_string)
assert result == expected

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
# coding=utf-8
import re
from pathlib import Path
import pytest
from click.testing import CliRunner
from bdfr.__main__ import cli
does_test_config_exist = Path('test_config.cfg').exists()
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['-s', 'Mindustry', '-L', 1],
['-s', 'r/Mindustry', '-L', 1],
['-s', 'r/mindustry', '-L', 1],
['-s', 'mindustry', '-L', 1],
['-s', 'https://www.reddit.com/r/TrollXChromosomes/', '-L', 1],
['-s', 'r/TrollXChromosomes/', '-L', 1],
['-s', 'TrollXChromosomes/', '-L', 1],
['-s', 'trollxchromosomes', '-L', 1],
['-s', 'trollxchromosomes,mindustry,python', '-L', 1],
['-s', 'trollxchromosomes, mindustry, python', '-L', 1],
['-s', 'trollxchromosomes', '-L', 1, '--time', 'day'],
['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'],
['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'],
['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'],
['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'],
['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'],
['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'],
))
def test_cli_download_subreddits(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Added submissions from subreddit ' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['-l', 'm2601g'],
['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'],
['-l', 'm3hxzd'], # Really long title used to overflow filename limit
['-l', 'm3kua3'], # Has a deleted user
['-l', 'm5bqkf'], # Resource leading to a 404
))
def test_cli_download_links(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10],
['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--sort', 'rising'],
['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--time', 'week'],
['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--time', 'week', '--sort', 'rising'],
))
def test_cli_download_multireddit(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Added submissions from multireddit ' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--user', 'helen_darten', '-m', 'xxyyzzqwerty', '-L', 10],
))
def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Failed to get submissions for multireddit' in result.output
assert 'received 404 HTTP response' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.authenticated
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--user', 'me', '--upvoted', '--authenticate', '-L', 10],
['--user', 'me', '--saved', '--authenticate', '-L', 10],
['--user', 'me', '--submitted', '--authenticate', '-L', 10],
['--user', 'djnish', '--submitted', '-L', 10],
['--user', 'djnish', '--submitted', '-L', 10, '--time', 'month'],
['--user', 'djnish', '--submitted', '-L', 10, '--sort', 'controversial'],
))
def test_cli_download_user_data_good(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Downloaded submission ' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.authenticated
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--user', 'me', '-L', 10, '--folder-scheme', ''],
))
def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'To use "me" as a user, an authenticated Reddit instance must be used' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--subreddit', 'python', '-L', 10, '--search-existing'],
))
def test_cli_download_search_existing(test_args: list[str], tmp_path: Path):
Path(tmp_path, 'test.txt').touch()
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Calculating hashes for' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'],
))
def test_cli_download_download_filters(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Download filter removed submission' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.slow
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--subreddit', 'all', '-L', '100', '--sort', 'new'],
))
def test_cli_download_long(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['-l', 'gstd4hk'],
['-l', 'm2601g'],
))
def test_cli_archive_single(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert re.search(r'Writing entry .*? to file in .*? format', result.output)
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--subreddit', 'Mindustry', '-L', 25],
['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'],
['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'],
['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'],
['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'],
['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'],
))
def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert re.search(r'Writing entry .*? to file in .*? format', result.output)
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--user', 'me', '--authenticate', '--all-comments', '-L', '10'],
))
def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.slow
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--subreddit', 'all', '-L', 100],
['--subreddit', 'all', '-L', 100, '--sort', 'new'],
))
def test_cli_archive_long(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert re.search(r'Writing entry .*? to file in .*? format', result.output)
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.slow
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--user', 'sdclhgsolgjeroij', '--submitted', '-L', 10],
['--user', 'me', '--upvoted', '-L', 10],
['--user', 'sdclhgsolgjeroij', '--upvoted', '-L', 10],
))
def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.slow
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--time', 'random'],
['--sort', 'random'],
))
def test_cli_download_hard_fail(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code != 0
def test_cli_download_use_default_config(tmp_path: Path):
runner = CliRunner()
test_args = ['download', '-vv', str(tmp_path)]
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['-l', 'm2601g', '--exclude-id', 'm2601g'],
))
def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'in exclusion list' in result.output
assert 'Downloaded submission ' not in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['-l', 'm2601g', '--skip-subreddit', 'trollxchromosomes'],
['-s', 'trollxchromosomes', '--skip-subreddit', 'trollxchromosomes', '-L', '3'],
))
def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'in skip list' in result.output
assert 'Downloaded submission ' not in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
['--file-scheme', '{TITLE}'],
['--file-scheme', '{TITLE}_test_{SUBREDDIT}'],
))
def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert 'Some files might not be downloaded due to name conflicts' in result.output

71
bdfr/tests/test_oauth2.py Normal file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# coding=utf-8
import configparser
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from bdfr.exceptions import BulkDownloaderException
from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager
@pytest.fixture()
def example_config() -> configparser.ConfigParser:
out = configparser.ConfigParser()
config_dict = {'DEFAULT': {'user_token': 'example'}}
out.read_dict(config_dict)
return out
@pytest.mark.online
@pytest.mark.parametrize('test_scopes', (
{'history', },
{'history', 'creddits'},
{'account', 'flair'},
{'*', },
))
def test_check_scopes(test_scopes: set[str]):
OAuth2Authenticator._check_scopes(test_scopes)
@pytest.mark.parametrize(('test_scopes', 'expected'), (
('history', {'history', }),
('history creddits', {'history', 'creddits'}),
('history, creddits, account', {'history', 'creddits', 'account'}),
('history,creddits,account,flair', {'history', 'creddits', 'account', 'flair'}),
))
def test_split_scopes(test_scopes: str, expected: set[str]):
result = OAuth2Authenticator.split_scopes(test_scopes)
assert result == expected
@pytest.mark.online
@pytest.mark.parametrize('test_scopes', (
{'random', },
{'scope', 'another_scope'},
))
def test_check_scopes_bad(test_scopes: set[str]):
with pytest.raises(BulkDownloaderException):
OAuth2Authenticator._check_scopes(test_scopes)
def test_token_manager_read(example_config: configparser.ConfigParser):
mock_authoriser = MagicMock()
mock_authoriser.refresh_token = None
test_manager = OAuth2TokenManager(example_config, MagicMock())
test_manager.pre_refresh_callback(mock_authoriser)
assert mock_authoriser.refresh_token == example_config.get('DEFAULT', 'user_token')
def test_token_manager_write(example_config: configparser.ConfigParser, tmp_path: Path):
test_path = tmp_path / 'test.cfg'
mock_authoriser = MagicMock()
mock_authoriser.refresh_token = 'changed_token'
test_manager = OAuth2TokenManager(example_config, test_path)
test_manager.post_refresh_callback(mock_authoriser)
assert example_config.get('DEFAULT', 'user_token') == 'changed_token'
with open(test_path, 'r') as file:
file_contents = file.read()
assert 'user_token = changed_token' in file_contents

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# coding=utf-8
from unittest.mock import MagicMock
import pytest
from bdfr.resource import Resource
@pytest.mark.parametrize(('test_url', 'expected'), (
('test.png', '.png'),
('another.mp4', '.mp4'),
('test.jpeg', '.jpeg'),
('http://www.random.com/resource.png', '.png'),
('https://www.resource.com/test/example.jpg', '.jpg'),
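    # the final suffix wins, and query strings and fragments are stripped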
('hard.png.mp4', '.mp4'),
('https://preview.redd.it/7zkmr1wqqih61.png?width=237&format=png&auto=webp&s=19de214e634cbcad99', '.png'),
('test.jpg#test', '.jpg'),
('test.jpg?width=247#test', '.jpg'),
('https://www.test.com/test/test2/example.png?random=test#thing', '.png'),
))
def test_resource_get_extension(test_url: str, expected: str):
test_resource = Resource(MagicMock(), test_url)
result = test_resource._determine_extension()
assert result == expected
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'),
))
def test_download_online_resource(test_url: str, expected_hash: str):
test_resource = Resource(MagicMock(), test_url)
test_resource.download(120)
assert test_resource.hash.hexdigest() == expected_hash

View File

@@ -1,17 +0,0 @@
version: "3"
services:
bdfr:
build:
context: .
dockerfile: ./Dockerfile
image: bdfr
container_name: bdfr
ports:
- "8080:8080"
- "7634:7634"
volumes:
- .:/bdfr:z
container_name: bdfr_container
network_mode: bridge

49
docs/ARCHITECTURE.md Normal file
View File

@@ -0,0 +1,49 @@
# Architecture
When the project was rewritten for v2, the goal was to make the codebase easily extensible and much easier to read and modify. However, this document provides a step-by-step look at the process that the BDFR goes through, so that any prospective developers can more easily grasp the way the code works.
## Design Ethos
The BDFR is designed to be a stateless downloader. This means that the state of the program is forgotten between each run of the program. There are no central lists, databases, or indices that the BDFR uses, only the actual files on disk. There are several advantages to this approach:
1. There is no chance of the database being corrupted or changed by something other than the BDFR, rendering the BDFR's "idea" of the archive wrong or incomplete.
2. Any information about the archive is contained in the archive itself, i.e. a list of all submission IDs in the archive can be extracted from the names of the files in said archive, assuming an appropriate naming scheme was used (see the sketch after this list).
3. Archives can be merged, split, or edited without worrying about having to update a central database.
4. There are no versioning issues between updates of the BDFR, where old versions would be stuck with a worse form of the database.
5. An archive can be put on a USB drive, moved to another computer with a possibly very different BDFR version, and work completely fine.
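As a concrete illustration of point 2, recovering every submission ID from an archive can be done purely from the file names. The sketch below assumes a file scheme ending in `_{POSTID}`, such as `{REDDITOR}_{TITLE}_{POSTID}`; the function name is illustrative.
```python
from pathlib import Path


def extract_submission_ids(archive_root: Path) -> set[str]:
    # Assumes every file stem ends in _{POSTID}: the ID is whatever
    # follows the final underscore
    return {
        path.stem.rsplit('_', 1)[-1]
        for path in archive_root.rglob('*')
        if path.is_file()
    }
```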
Another major part of the ethos of the design is DOTADIW: Do One Thing And Do It Well. It's a major part of the Unix philosophy and states that each tool should have a well-defined, limited purpose. To this end, the BDFR is, as the name implies, a *downloader*. That is the scope of the tool. Managing the downloaded files can be left to better-suited programs, since the BDFR is not a file manager. Nor does the BDFR concern itself with how any of the data downloaded is displayed, changed, parsed, or analysed. This makes the BDFR suitable for data science-related tasks, archiving, personal downloads, or analysis of various Reddit sources, as the BDFR is completely agnostic on how the data is used.
## The Download Process
The BDFR is organised around a central object, the RedditDownloader class. The Archiver class, which powers the `archive` command, inherits from and extends it.
1. The RedditDownloader parses all the arguments and configuration options, held in the Configuration object, and creates a variety of internal objects for use, such as the file name formatter, download filter, etc.
2. The RedditDownloader scrapes raw submissions from Reddit via several methods relating to different sources. A source is defined as a single stream of submissions from a subreddit, multireddit, or user list.
3. These raw submissions are passed to the DownloaderFactory class to select the specialised downloader class to use. Each of these is for a specific website or link type, with some catch-all classes like Direct.
4. The BaseDownloader child, spawned by DownloaderFactory, takes the link and does any necessary processing to find the direct link to the actual resource.
5. This is returned to the RedditDownloader in the form of a Resource object. This holds the URL and some other information for the final resource.
6. The Resource is passed through the DownloadFilter instantiated in step 1.
7. The destination file name for the Resource is calculated. If it already exists, then the Resource will be discarded.
8. Here the actual data is downloaded to the Resource, and a hash is calculated, which is used to find duplicates.
9. Only then is the Resource written to the disk.
This is the step-by-step process that the BDFR goes through to download a Reddit post.
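In outline, this loop looks roughly like the sketch below. The class names match those above, but the helper names and exact signatures are assumptions, not the real internal API:
```python
def run_download_pass(downloader, download_dir, timeout=120):
    # A condensed, illustrative rendering of steps 2-9 above
    for submission in downloader.scrape_submissions():  # step 2 (name assumed)
        site_downloader = downloader.pick_site_downloader(submission)  # steps 3-4 (name assumed)
        for resource in site_downloader.find_resources():  # step 5
            if not downloader.download_filter.check_url(resource.url):  # step 6
                continue
            destination = downloader.file_name_formatter.format_path(resource, download_dir)  # step 7
            if destination.exists():
                continue
            resource.download(timeout)  # step 8: fetch the data and compute its hash
            if resource.hash.hexdigest() in downloader.master_hash_list:
                continue  # duplicate content, skip the write
            destination.write_bytes(resource.content)  # step 9 (.content is assumed)
```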
## Adding another Supported Site
This is one of the easiest changes to make to the code. First, any new class must inherit from the BaseDownloader class, which provides an abstract parent to implement. However, take note of the other classes as well. Many downloaders can inherit from one another instead of just the BaseDownloader. For example, the VReddit class, used for downloading video from Reddit, inherits almost all of its code from the YouTube class. **Minimise code duplication wherever possible**.
Once the downloader class has been written, **and tests added** for it as well, the regex string for the site's URLs can be added to the DownloaderFactory. Then additional tests must be added for the DownloaderFactory to ensure that the appropriate classes are called when the right URLs are passed to the factory.
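As a hedged sketch of the shape such a class takes, assuming the module path, the `post` attribute, and the `find_resources` signature follow the existing pattern:
```python
from bdfr.resource import Resource
from bdfr.site_downloaders.base_downloader import BaseDownloader  # module path is an assumption


class ExampleSite(BaseDownloader):
    """Hypothetical downloader for a site whose page URLs map directly onto media URLs."""

    def __init__(self, post):
        super().__init__(post)

    def find_resources(self, authenticator=None) -> list[Resource]:
        # Derive the direct media link from the submission URL
        media_url = self.post.url.replace('/post/', '/media/')
        return [Resource(self.post, media_url)]
```
A matching regex such as `examplesite\.com` would then be registered in the DownloaderFactory so that this class is selected for the site's URLs.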
## Adding Other Features
For a fundamentally different execution path for the program, such as the difference between the `archive` and `download` commands, it is best to inherit from the RedditDownloader class and override or add functionality as needed.
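For instance, a new command could reuse the scraping machinery and swap out only the per-submission handling. In the minimal sketch below, `RedditDownloader`, `download`, and `reddit_lists` come from the existing code; everything else is hypothetical:
```python
from bdfr.downloader import RedditDownloader


class Mirrorer(RedditDownloader):
    """Hypothetical command that reuses source scraping but handles each submission differently."""

    def download(self):
        for generator in self.reddit_lists:
            for submission in generator:
                self._mirror_submission(submission)

    def _mirror_submission(self, submission):
        raise NotImplementedError  # the new behaviour would live here
```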

76
docs/CODE_OF_CONDUCT.md Normal file
View File

@@ -0,0 +1,76 @@
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, gender identity and expression, level of experience,
education, socio-economic status, nationality, personal appearance, race,
religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team via Discord. All complaints will
be reviewed and investigated and will result in a response that is deemed
necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an
incident. Further details of specific enforcement policies may be posted
separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org

118
docs/CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,118 @@
# Contributing
When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on it. This helps create a trail on GitHub and keeps things organised.
**Please don't open an issue on GitHub** unless you are reporting a bug or proposing a feature. For questions, there is a discussion tab on the repository's GitHub page where you can interact with the developers and ask questions. If you believe that something is a bug, or that a feature should be added, then by all means open an issue.
All communication on GitHub, Discord, email, or any other medium must conform to the [Code of Conduct](CODE_OF_CONDUCT.md). It's not that hard to stay respectful.
## Opening an Issue
**Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists, try to contribute to the issue.
### Bugs
When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be asked for in the thread of the bug.
### Feature requests
In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has a very small chance of being accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
## Pull Requests
Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) section below before actually writing any code.
Once you have done both of these, the below list shows the path that should be followed when writing a PR.
1. If an issue does not already exist, open one that will relate to the PR.
2. Ensure that any changes fit into the architecture specified above.
3. Ensure that you have written tests that cover the new code.
4. Ensure that no existing tests fail, unless there is a good reason for them to do so.
5. If needed, update any documentation with changes.
6. Open a pull request that references the relevant issue.
7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.
Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
## Preparing the environment for development
Bulk Downloader for Reddit requires Python 3.9 at minimum. First, ensure that your Python installation satisfies this.
The BDFR is built so that it can be packaged and installed via `pip`. This places the BDFR next to other Python packages and enables you to run the program from any directory. Since it is managed by pip, you can also uninstall it.
To install the program, clone the repository and run pip inside the project's root directory:
```bash
$ git clone https://github.com/aliparlakci/bulk-downloader-for-reddit.git
$ cd ./bulk-downloader-for-reddit
$ python3 -m pip install -e .
```
The **`-e`** parameter creates a link to that folder. That is, any change inside the folder affects the package immediately, so when developing, you can be sure that the package is not stale and Python is always running your latest changes. (Due to this linking, moving, removing, or renaming the folder might break it.)
Then, you can run the program from anywhere in your disk as such:
```bash
$ python3 -m bdfr
```
## Style Guide
The BDFR must conform to the PEP8 standard wherever there is Python code, with one exception: line lengths may extend to 120 characters, but all other PEP8 standards must be followed.
It's easy to format your code without any manual work using a variety of tools. Autopep8 is a good one, and can be run with `autopep8 --max-line-length 120`, which will format the code according to the style in use with the BDFR.
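For instance, the following installs autopep8 and reformats a single file in place (the file path here is purely illustrative):

```bash
python3 -m pip install autopep8
autopep8 --max-line-length 120 --in-place bdfr/example_module.py
```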
Hanging brackets are preferred when there are many items, items that otherwise go over the 120 character line limit, or when doing so would increase readability. It is also preferred when there might be many commits altering the list, such as with the parameter lists for tests. A hanging comma is also required in such cases. An example of this is below:
```python
test = [
'test 1',
'test 2',
'test 3',
]
```
Note that the closing bracket is on its own line and that the opening bracket is followed by a line break before the first item. Also note the comma after the last item.
## Tests
### Running Tests
There are a lot of tests in the BDFR; in fact, there are more tests than lines of functional code. This is one of the strengths of the BDFR: it is thoroughly tested. The codebase uses pytest, a third-party package that provides many functions and objects useful for testing Python code.
When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken anything. Otherwise, while developing, it can be helpful (and much quicker) to run only a subset of the tests.
This is accomplished with marks, a system that pytest uses to categorise tests. The following marks are currently in use in the BDFR test suite; an example of a marked test is sketched after the list.
- `slow`
- This marks a test that may take a long time to complete
- Usually marks a test that downloads many submissions or downloads a particularly large resource
- `online`
- This marks a test that requires an internet connection and uses online resources
- `reddit`
- This marks a test that accesses online Reddit specifically
- `authenticated`
- This marks a test that requires a test configuration file with a valid OAuth2 token
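For instance, a test that talks to Reddit through the `reddit_instance` fixture would carry both the `online` and `reddit` marks. A minimal sketch follows; the test body and submission ID are illustrative only:

```python
import pytest


@pytest.mark.online
@pytest.mark.reddit
def test_submission_has_title(reddit_instance):
    # Fetches a live submission, hence both the 'online' and 'reddit' marks;
    # the submission ID here is a placeholder
    submission = reddit_instance.submission(id='abc123')
    assert submission.title
```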
These tests can be run either all at once, or excluding certain marks. The tests that require online resources, such as those marked `reddit` or `online`, will naturally require more time to run than tests that are entirely offline. To run tests, you must be in the root directory of the project and can use the following command.
```bash
pytest
```
To exclude one or more marks, the following command can be used, substituting the unwanted mark.
```bash
pytest -m "not online"
pytest -m "not reddit and not authenticated"
```
For more details, review the pytest documentation that is freely available online.
Many IDEs also provide integrated functionality to run and display the results from tests, and almost all of them support pytest in some capacity. This is the recommended method, due to the additional debugging and inspection capabilities that these tools provide.
### Writing Tests
When writing tests, ensure that they follow the style guide. The BDFR uses pytest to run tests. Wherever possible, parameterise tests, even if you only have one test case, as in the sketch below. This makes it easier to expand in the future, as the ultimate goal is to have multiple test cases for every test, instead of just one.
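A minimal sketch of a parameterised test with a single case (the test itself is illustrative), using the hanging-bracket style described above:

```python
import pytest


@pytest.mark.parametrize(('test_string', 'expected_length'), (
    ('test 1', 6),
))
def test_string_length(test_string: str, expected_length: int):
    # Only one case for now, but new tuples can be added to the list
    # without changing the test body
    assert len(test_string) == expected_length
```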
Where required, use mocks to simplify tests and to reduce the resources or complexity involved. Tests should be as small as possible and should test as small a part of the code as possible. Comprehensive and integration tests are run through the `click` framework and are located in their own file.
It is also expected that new tests be classified correctly with the marks described above, i.e. if a test accesses Reddit through a `reddit_instance` object, it must be given the `reddit` mark. If it requires an authenticated Reddit instance, then it must also have the `authenticated` mark.


@@ -1,35 +0,0 @@
# Interpret from source code
## Requirements
### 🐍 Python 3 Interpreter
- Python 3 is required. Check whether it is already installed [here](#finding-the-correct-keyword-for-python).
- If not, download the matching release for your platform [here](https://www.python.org/downloads/) and install it. If you are a *Windows* user, selecting the **Add Python 3 to PATH** option when installing the software is **mandatory**.
### 📃 Source Code
[Download the repository](https://github.com/aliparlakci/bulk-downloader-for-reddit/archive/master.zip) and extract the zip into a folder.
## 💻 Using the command line
Open the [Command Prompt](https://youtu.be/bgSSJQolR0E?t=18), [Powershell](https://youtu.be/bgSSJQolR0E?t=18) or [Terminal](https://youtu.be/Pz4yHAB3G8w?t=31) in the folder that contains the script.py file (click on the links to see how).
### Finding the correct keyword for Python
Enter these lines into the terminal window until one of them prints out a version starting with **`3.`**:
- `python --version`
- `python3 --version`
- `py --version`
- `py -3 --version`
Once one does, your keyword is that command without the `--version` part.
## 📦 Installing dependencies
When you are in the directory where script.py is located, enter the line below into the terminal window, using your keyword instead of `python`:
```console
python -m pip install -r requirements.txt
```
## 🏃‍♂️ Running the code
Type the code below into the command line inside the program folder, using your keyword instead of `python`:
```console
python script.py
```
The program should guide you through the process. **However**, you can also use custom options. See [Options](../README.md#⚙-Options).

7
pytest.ini Normal file

@@ -0,0 +1,7 @@
[pytest]
markers =
online: tests require a connection to the internet
reddit: tests require a connection to Reddit
slow: test is slow to run
authenticated: test requires an authenticated Reddit instance


@@ -1,4 +1,9 @@
bs4
requests
praw
youtube-dl
appdirs>=1.4.4
bs4>=0.0.1
click>=7.1.2
dict2xml>=1.7.0
ffmpeg-python>=0.2.0
praw>=7.2.0
pyyaml>=5.4.1
requests>=2.25.1
youtube-dl>=2021.3.14

379
script.py

@@ -1,379 +0,0 @@
#!/usr/bin/env python3
"""
This program downloads imgur, gfycat and direct image and video links of
saved posts from a reddit account. It is written in Python 3.
"""
import logging
import os
import sys
import time
from io import StringIO
from pathlib import Path
from prawcore.exceptions import InsufficientScope
from src.downloaders.Direct import Direct
from src.downloaders.Erome import Erome
from src.downloaders.Gfycat import Gfycat
from src.downloaders.Imgur import Imgur
from src.downloaders.Gallery import Gallery
from src.downloaders.redgifs import Redgifs
from src.downloaders.selfPost import SelfPost
from src.downloaders.vreddit import VReddit
from src.downloaders.youtube import Youtube
from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork
from src.errors import ImgurLimitError, FileAlreadyExistsError, ImgurLoginError, NotADownloadableLinkError, NoSuitablePost, InvalidJSONFile, FailedToDownload, TypeInSkip, DomainInSkip, AlbumNotDownloadedCompletely, full_exc_info
from src.searcher import getPosts
from src.utils import (GLOBAL, createLogFile, nameCorrector,
printToFile)
from src.jsonHelper import JsonFile
from src.config import Config
from src.arguments import Arguments
from src.programMode import ProgramMode
from src.reddit import Reddit
from src.store import Store
from time import sleep
__author__ = "Ali Parlakci"
__license__ = "GPL"
__version__ = "1.10.0"
__maintainer__ = "Ali Parlakci"
__email__ = "parlakciali@gmail.com"
def postFromLog(fileName):
"""Analyze a log file and return a list of dictionaries containing
submissions
"""
if Path.is_file(Path(fileName)):
content = JsonFile(fileName).read()
else:
print("File not found")
sys.exit()
try:
del content["HEADER"]
except KeyError:
pass
posts = []
for post in content:
if content[post][-1]['TYPE'] is not None:
posts.append(content[post][-1])
return posts
def isPostExists(POST, directory):
"""Figure out a file's name and checks if the file already exists"""
filename = GLOBAL.config['filename'].format(**POST)
possibleExtensions = [".jpg", ".png", ".mp4",
".gif", ".webm", ".md", ".mkv", ".flv"]
for extension in possibleExtensions:
path = directory / Path(filename+extension)
if path.exists():
return True
return False
def downloadPost(SUBMISSION, directory):
downloaders = {
"imgur":Imgur,"gfycat":Gfycat,"erome":Erome,"direct":Direct,"self":SelfPost,
"redgifs":Redgifs, "gifdeliverynetwork": GifDeliveryNetwork,
"v.redd.it": VReddit, "youtube": Youtube, "gallery": Gallery
}
print()
if SUBMISSION['TYPE'] in downloaders:
downloaders[SUBMISSION['TYPE']](directory, SUBMISSION)
else:
raise NoSuitablePost
def download(submissions):
"""Analyze list of submissions and call the right function
to download each one, catch errors, update the log files
"""
downloadedCount = 0
duplicates = 0
FAILED_FILE = createLogFile("FAILED")
if GLOBAL.arguments.unsave:
reddit = Reddit(GLOBAL.config['credentials']['reddit']).begin()
subsLenght = len(submissions)
for i in range(len(submissions)):
print(f"\n({i+1}/{subsLenght})", end="")
print(submissions[i]['POSTID'],
f"r/{submissions[i]['SUBREDDIT']}",
f"u/{submissions[i]['REDDITOR']}",
submissions[i]['FLAIR'] if submissions[i]['FLAIR'] else "",
sep="",
end="")
print(f" {submissions[i]['TYPE'].upper()}", end="", noPrint=True)
directory = GLOBAL.directory / \
GLOBAL.config["folderpath"].format(**submissions[i])
details = {
**submissions[i],
**{
"TITLE": nameCorrector(
submissions[i]['TITLE'],
reference=str(directory)
+ GLOBAL.config['filename'].format(**submissions[i])
+ ".ext"
)
}
}
filename = GLOBAL.config['filename'].format(**details)
if isPostExists(details, directory):
print()
print(directory)
print(filename)
print("It already exists")
duplicates += 1
continue
if any(domain in submissions[i]['CONTENTURL'] for domain in GLOBAL.arguments.skip):
print()
print(submissions[i]['CONTENTURL'])
print("Domain found in skip domains, skipping post...")
continue
try:
downloadPost(details, directory)
GLOBAL.downloadedPosts.add(details['POSTID'])
try:
if GLOBAL.arguments.unsave:
reddit.submission(id=details['POSTID']).unsave()
except InsufficientScope:
reddit = Reddit().begin()
reddit.submission(id=details['POSTID']).unsave()
if GLOBAL.arguments.download_delay:
print(f"Delaying next download for {GLOBAL.arguments.download_delay} seconds...")
sleep(GLOBAL.arguments.download_delay)
downloadedCount += 1
except FileAlreadyExistsError:
print("It already exists")
GLOBAL.downloadedPosts.add(details['POSTID'])
duplicates += 1
except ImgurLoginError:
print(
"Imgur login failed. \nQuitting the program "
"as unexpected errors might occur."
)
sys.exit()
except ImgurLimitError as exception:
FAILED_FILE.add({int(i+1): [
"{class_name}: {info}".format(
class_name=exception.__class__.__name__, info=str(
exception)
),
details
]})
except NotADownloadableLinkError as exception:
print(
"{class_name}: {info}".format(
class_name=exception.__class__.__name__, info=str(
exception)
)
)
FAILED_FILE.add({int(i+1): [
"{class_name}: {info}".format(
class_name=exception.__class__.__name__, info=str(
exception)
),
submissions[i]
]})
except TypeInSkip:
print()
print(submissions[i]['CONTENTURL'])
print("Skipping post...")
except DomainInSkip:
print()
print(submissions[i]['CONTENTURL'])
print("Skipping post...")
except NoSuitablePost:
print("No match found, skipping...")
except FailedToDownload:
print("Failed to download the posts, skipping...")
except AlbumNotDownloadedCompletely as exc:
print("Album was not downloaded completely.")
FAILED_FILE.add({int(i+1): [
"{class_name}: {info}".format(
class_name=exc.__class__.__name__, info=str(exc)
),
submissions[i]
]})
except Exception as exc:
print(
"{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exc.__class__.__name__, info=str(exc)
)
)
logging.error(sys.exc_info()[0].__name__,
exc_info=full_exc_info(sys.exc_info()))
print(GLOBAL.log_stream.getvalue(), noPrint=True)
FAILED_FILE.add({int(i+1): [
"{class_name}: {info}".format(
class_name=exc.__class__.__name__, info=str(exc)
),
submissions[i]
]})
if duplicates:
print(f"\nThere {'were' if duplicates > 1 else 'was'} "
f"{duplicates} duplicate{'s' if duplicates > 1 else ''}")
if downloadedCount:
print(f"Total of {downloadedCount} "
f"link{'s' if downloadedCount > 1 else ''} downloaded!")
else:
print("Nothing is downloaded :(")
def printLogo():
VanillaPrint(
f"\nBulk Downloader for Reddit v{__version__}\n"
f"Written by Ali PARLAKCI parlakciali@gmail.com\n\n"
f"https://github.com/aliparlakci/bulk-downloader-for-reddit/\n"
)
def main():
if not Path(GLOBAL.defaultConfigDirectory).is_dir():
os.makedirs(GLOBAL.defaultConfigDirectory)
if Path("config.json").exists():
GLOBAL.configDirectory = Path("config.json")
else:
GLOBAL.configDirectory = GLOBAL.defaultConfigDirectory / "config.json"
try:
GLOBAL.config = Config(GLOBAL.configDirectory).generate()
except InvalidJSONFile as exception:
VanillaPrint(str(exception.__class__.__name__), ">>", str(exception))
VanillaPrint("Resolve it or remove it to proceed")
sys.exit()
sys.argv = sys.argv + GLOBAL.config["options"].split()
arguments = Arguments.parse()
GLOBAL.arguments = arguments
if arguments.set_filename:
Config(GLOBAL.configDirectory).setCustomFileName()
sys.exit()
if arguments.set_folderpath:
Config(GLOBAL.configDirectory).setCustomFolderPath()
sys.exit()
if arguments.set_default_directory:
Config(GLOBAL.configDirectory).setDefaultDirectory()
sys.exit()
if arguments.set_default_options:
Config(GLOBAL.configDirectory).setDefaultOptions()
sys.exit()
if arguments.use_local_config:
JsonFile("config.json").add(GLOBAL.config)
if arguments.directory:
GLOBAL.directory = Path(arguments.directory.strip())
elif "default_directory" in GLOBAL.config and GLOBAL.config["default_directory"] != "":
GLOBAL.directory = Path(
GLOBAL.config["default_directory"].format(time=GLOBAL.RUN_TIME))
else:
GLOBAL.directory = Path(input("\ndownload directory: ").strip())
if arguments.downloaded_posts:
GLOBAL.downloadedPosts = Store(arguments.downloaded_posts)
else:
GLOBAL.downloadedPosts = Store()
printLogo()
print("\n", " ".join(sys.argv), "\n", noPrint=True)
if arguments.log is not None:
logDir = Path(arguments.log)
download(postFromLog(logDir))
sys.exit()
programMode = ProgramMode(arguments).generate()
try:
posts = getPosts(programMode)
except Exception as exc:
logging.error(sys.exc_info()[0].__name__,
exc_info=full_exc_info(sys.exc_info()))
print(GLOBAL.log_stream.getvalue(), noPrint=True)
print(exc)
sys.exit()
if posts is None:
print("I could not find any posts in that URL")
sys.exit()
if GLOBAL.arguments.no_download:
pass
else:
download(posts)
if __name__ == "__main__":
GLOBAL.log_stream = StringIO()
logging.basicConfig(stream=GLOBAL.log_stream, level=logging.INFO)
try:
VanillaPrint = print
print = printToFile
GLOBAL.RUN_TIME = str(time.strftime(
"%d-%m-%Y_%H-%M-%S",
time.localtime(time.time())
))
main()
except KeyboardInterrupt:
if GLOBAL.directory is None:
GLOBAL.directory = Path("..\\")
except Exception as exception:
if GLOBAL.directory is None:
GLOBAL.directory = Path("..\\")
logging.error(sys.exc_info()[0].__name__,
exc_info=full_exc_info(sys.exc_info()))
print(GLOBAL.log_stream.getvalue())
if not GLOBAL.arguments.quit: input("\nPress enter to quit\n")

23
setup.cfg Normal file

@@ -0,0 +1,23 @@
[metadata]
name = bdfr
description_file = README.md
description_content_type = text/markdown
home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
keywords = reddit, download, archive
version = 2.0.3
author = Ali Parlakci
author_email = parlakciali@gmail.com
maintainer = Serene Arc
maintainer_email = serenical@gmail.com
license = GPLv3
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Natural Language :: English
Environment :: Console
Operating System :: OS Independent
requires_python = >=3.9
platforms = any
[files]
packages = bdfr


@@ -1,50 +1,6 @@
#!C:\Users\Ali\AppData\Local\Programs\Python\Python36\python.exe
## python setup.py build
import sys
from cx_Freeze import setup, Executable
from script import __version__
options = {
"build_exe": {
"packages":[
"idna", "praw", "requests", "multiprocessing"
]
}
}
if sys.platform == "win32":
executables = [Executable(
"script.py",
targetName="bulk-downloader-for-reddit.exe",
shortcutName="Bulk Downloader for Reddit",
shortcutDir="DesktopFolder"
)]
elif sys.platform == "linux":
executables = [Executable(
"script.py",
targetName="bulk-downloader-for-reddit",
shortcutName="Bulk Downloader for Reddit",
shortcutDir="DesktopFolder"
)]
setup(
name = "Bulk Downloader for Reddit",
version = __version__,
description = "Bulk Downloader for Reddit",
author = "Ali Parlakci",
author_email="parlakciali@gmail.com",
url="https://github.com/aliparlakci/bulk-downloader-for-reddit",
classifiers=(
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)"
"Natural Language :: English",
"Environment :: Console",
"Operating System :: OS Independent",
),
executables = executables,
options = options
)
#!/usr/bin/env python3
# encoding=utf-8
from setuptools import setup
setup(setup_requires=['pbr', 'appdirs'], pbr=True, data_files=[('config', ['bdfr/default_config.cfg'])])


@@ -1,176 +0,0 @@
import argparse
import sys
class Arguments:
@staticmethod
def parse(arguments=None):
"""Initialize argparse and add arguments"""
if arguments is None:
arguments = []
parser = argparse.ArgumentParser(allow_abbrev=False,
description="This program downloads "
"media from reddit "
"posts")
parser.add_argument("--directory", "-d",
help="Specifies the directory where posts will be "
"downloaded to",
metavar="DIRECTORY")
parser.add_argument("--verbose", "-v",
help="Verbose Mode",
action="store_true",
default=False)
parser.add_argument("--quit", "-q",
help="Auto quit afer the process finishes",
action="store_true",
default=False)
parser.add_argument("--link", "-l",
help="Get posts from link",
metavar="link")
parser.add_argument("--saved",
action="store_true",
required="--unsave" in sys.argv,
help="Triggers saved mode")
parser.add_argument("--unsave",
action="store_true",
help="Unsaves downloaded posts")
parser.add_argument("--submitted",
action="store_true",
help="Gets posts of --user")
parser.add_argument("--upvoted",
action="store_true",
help="Gets upvoted posts of --user")
parser.add_argument("--log",
help="Takes a log file which created by itself "
"(json files), reads posts and tries downloadin"
"g them again.",
# type=argparse.FileType('r'),
metavar="LOG FILE")
parser.add_argument(
"--subreddit",
nargs="+",
help="Triggers subreddit mode and takes subreddit's "
"name without r/. use \"frontpage\" for frontpage",
metavar="SUBREDDIT",
type=str)
parser.add_argument("--multireddit",
help="Triggers multireddit mode and takes "
"multireddit's name without m/",
metavar="MULTIREDDIT",
type=str)
parser.add_argument("--user",
help="reddit username if needed. use \"me\" for "
"current user",
required="--multireddit" in sys.argv or
"--submitted" in sys.argv,
metavar="redditor",
type=str)
parser.add_argument(
"--search",
help="Searches for given query in given subreddits",
metavar="query",
type=str)
parser.add_argument("--sort",
help="Either hot, top, new, controversial, rising "
"or relevance default: hot",
choices=[
"hot", "top", "new", "controversial", "rising",
"relevance"
],
metavar="SORT TYPE",
type=str)
parser.add_argument("--limit",
help="default: unlimited",
metavar="Limit",
type=int)
parser.add_argument("--time",
help="Either hour, day, week, month, year or all."
" default: all",
choices=["all", "hour", "day",
"week", "month", "year"],
metavar="TIME_LIMIT",
type=str)
parser.add_argument("--skip",
nargs="+",
help="Skip posts with given type",
type=str,
choices=["images", "videos", "gifs", "self"],
default=[])
parser.add_argument("--skip-domain",
nargs="+",
help="Skip posts with given domain",
type=str,
default=[])
parser.add_argument("--set-folderpath",
action="store_true",
help="Set custom folderpath"
)
parser.add_argument("--set-filename",
action="store_true",
help="Set custom filename",
)
parser.add_argument(
"--set-default-directory",
action="store_true",
help="Set a default directory to be used in case no directory is given",
)
parser.add_argument(
"--set-default-options",
action="store_true",
help="Set default options to use everytime program runs",
)
parser.add_argument(
"--use-local-config",
action="store_true",
help="Creates a config file in the program's directory and uses it. Useful for having multiple configs",
)
parser.add_argument(
"--no-dupes",
action="store_true",
help="Do not download duplicate posts on different subreddits",
)
parser.add_argument(
"--downloaded-posts",
help="Use a hash file to keep track of downloaded files",
type=str)
parser.add_argument(
"--no-download",
action="store_true",
help="Just saved posts into a the POSTS.json file without downloading")
parser.add_argument(
"--download-delay",
metavar="DELAY",
type=int,
help="Amount, in seconds, to delay before beginning the next item in the download queue")
if arguments == []:
return parser.parse_args()
return parser.parse_args(arguments)


@@ -1,130 +0,0 @@
from src.reddit import Reddit
from src.jsonHelper import JsonFile
from src.utils import nameCorrector
class Config():
def __init__(self, filename):
self.filename = filename
self.file = JsonFile(self.filename)
def generate(self):
self._validateCredentials()
self._readCustomFileName()
self._readCustomFolderPath()
self._readDefaultOptions()
return self.file.read()
def setCustomFileName(self):
print("""
IMPORTANT: Do not change the filename structure frequently.
If you do, the program cannot detect duplicates and
will download already downloaded files again.
This would not create any duplicates in the directory, but
the program would not be as snappy as it should be.
Type a template file name for each post.
You can use SUBREDDIT, REDDITOR, POSTID, TITLE, UPVOTES, FLAIR, DATE in curly braces
The text in curly braces will be replaced with the corresponding property of each post
For example: {FLAIR}_{SUBREDDIT}_{REDDITOR}
Existing filename template:""", None if "filename" not in self.file.read() else self.file.read()["filename"])
filename = nameCorrector(input(">> ").upper())
self.file.add({
"filename": filename
})
def _readCustomFileName(self):
content = self.file.read()
if "filename" not in content:
self.file.add({
"filename": "{REDDITOR}_{TITLE}_{POSTID}"
})
content = self.file.read()
if not "{POSTID}" in content["filename"]:
self.file.add({
"filename": content["filename"] + "_{POSTID}"
})
def setCustomFolderPath(self):
print("""
Type a folder structure (generic folder path)
Use slash or DOUBLE backslash to separate folders
You can use SUBREDDIT, REDDITOR, POSTID, TITLE, UPVOTES, FLAIR, DATE in curly braces
The text in curly braces will be replaced with the corresponding property of each post
For example: {REDDITOR}/{SUBREDDIT}/{FLAIR}
Existing folder structure""", None if "folderpath" not in self.file.read() else self.file.read()["folderpath"])
folderpath = nameCorrector(input(">> ").strip("\\").strip("/").upper())
self.file.add({
"folderpath": folderpath
})
def _readCustomFolderPath(self, path=None):
content = self.file.read()
if "folderpath" not in content:
self.file.add({
"folderpath": "{SUBREDDIT}"
})
def setDefaultOptions(self):
print("""
Type options to be used every time the script runs
For example: --no-dupes --quit --limit 100 --skip youtube.com
Existing default options:""", None if "options" not in self.file.read() else self.file.read()["options"])
options = input(">> ").strip("")
self.file.add({
"options": options
})
def _readDefaultOptions(self, path=None):
content = self.file.read()
if "options" not in content:
self.file.add({
"options": ""
})
def _validateCredentials(self):
"""Read credentials from config.json file"""
try:
content = self.file.read()["credentials"]
except BaseException:
self.file.add({
"credentials": {}
})
content = self.file.read()["credentials"]
if "reddit" in content and len(content["reddit"]) != 0:
pass
else:
Reddit().begin()
print()
def setDefaultDirectory(self):
print("""Set a default directory to use in case no directory is given
Leave blank to reset it. You can use {time} in folder names to timestamp it
For example: D:/archive/BDFR_{time}
""")
print("Current default directory:", self.file.read()[
"default_directory"] if "default_directory" in self.file.read() else "")
self.file.add({
"default_directory": input(">> ")
})


@@ -1,16 +0,0 @@
import os
from src.downloaders.downloaderUtils import getFile, getExtension
from src.utils import GLOBAL
class Direct:
def __init__(self, directory, POST):
POST['EXTENSION'] = getExtension(POST['CONTENTURL'])
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"]
shortFilename = POST['POSTID'] + POST['EXTENSION']
getFile(filename, shortFilename, directory, POST['CONTENTURL'])


@@ -1,136 +0,0 @@
import os
import urllib.request
from html.parser import HTMLParser
from src.downloaders.downloaderUtils import getFile
from src.downloaders.downloaderUtils import getExtension
from src.errors import (AlbumNotDownloadedCompletely,
NotADownloadableLinkError, FileAlreadyExistsError)
from src.utils import GLOBAL
from src.utils import printToFile as print
class Erome:
def __init__(self, directory, post):
try:
IMAGES = self.getLinks(post['CONTENTURL'])
except urllib.error.HTTPError:
raise NotADownloadableLinkError("Not a downloadable link")
imagesLenght = len(IMAGES)
howManyDownloaded = imagesLenght
duplicates = 0
if imagesLenght == 1:
extension = getExtension(IMAGES[0])
"""Filenames are declared here"""
filename = GLOBAL.config['filename'].format(**post) + extension
shortFilename = post['POSTID'] + extension
imageURL = IMAGES[0]
if 'https://' not in imageURL and 'http://' not in imageURL:
imageURL = "https://" + imageURL
getFile(filename, shortFilename, directory, imageURL)
else:
filename = GLOBAL.config['filename'].format(**post)
print(filename)
folderDir = directory / filename
try:
if not os.path.exists(folderDir):
os.makedirs(folderDir)
except FileNotFoundError:
folderDir = directory / post['POSTID']
os.makedirs(folderDir)
for i in range(imagesLenght):
extension = getExtension(IMAGES[i])
filename = str(i + 1) + extension
imageURL = IMAGES[i]
if 'https://' not in imageURL and 'http://' not in imageURL:
imageURL = "https://" + imageURL
print(" ({}/{})".format(i + 1, imagesLenght))
print(" {}".format(filename))
try:
getFile(filename, filename, folderDir, imageURL, indent=2)
print()
except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1
howManyDownloaded -= 1
except Exception as exception:
# raise exception
print("\n Could not get the file")
print(
" "
+ "{class_name}: {info}".format(
class_name=exception.__class__.__name__,
info=str(exception)
)
+ "\n"
)
howManyDownloaded -= 1
if duplicates == imagesLenght:
raise FileAlreadyExistsError
if howManyDownloaded + duplicates < imagesLenght:
raise AlbumNotDownloadedCompletely(
"Album Not Downloaded Completely"
)
def getLinks(self, url):
content = []
lineNumber = None
class EromeParser(HTMLParser):
tag = None
def handle_starttag(self, tag, attrs):
self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}
pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))
""" FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
for i in range(len(pageSource)):
obj = EromeParser()
obj.feed(pageSource[i])
tag = obj.tag
if tag is not None:
if "div" in tag:
if "id" in tag["div"]:
if tag["div"]["id"] == "album":
lineNumber = i
break
for line in pageSource[lineNumber:]:
obj = EromeParser()
obj.feed(line)
tag = obj.tag
if tag is not None:
if "img" in tag:
if "class" in tag["img"]:
if tag["img"]["class"] == "img-front":
content.append(tag["img"]["src"])
elif "source" in tag:
content.append(tag["source"]["src"])
return [
link for link in content
if link.endswith("_480p.mp4") or not link.endswith(".mp4")
]


@@ -1,115 +0,0 @@
import os
import json
import urllib
import requests
from src.utils import GLOBAL
from src.utils import printToFile as print
from src.downloaders.downloaderUtils import getFile
from src.errors import FileNotFoundError, FileAlreadyExistsError, AlbumNotDownloadedCompletely, ImageNotFound, NotADownloadableLinkError, TypeInSkip
class Gallery:
def __init__(self, directory, post):
links = post['CONTENTURL']
images = {}
count = 0
for link in links:
path = urllib.parse.urlparse(link).path
base = os.path.basename(path)
name = os.path.splitext(base)[0]
images[count] = {'id': name, 'url': link}
count = count + 1
self.directory = directory
self.post = post
self.downloadAlbum(images, count)
@staticmethod
def getData(link):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
}
res = requests.get(link, headers=headers)
if res.status_code != 200:
raise ImageNotFound(
f"Server responded with {res.status_code} to {link}")
pageSource = res.text
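# The gallery metadata is embedded in the page source as a JavaScript
# object; locate it between these two markers and parse it as JSON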
STARTING_STRING = "_r = {"
ENDING_STRING = "</script>"
STARTING_STRING_LENGHT = len(STARTING_STRING)
try:
startIndex = pageSource.index(
STARTING_STRING) + STARTING_STRING_LENGHT
endIndex = pageSource.index(ENDING_STRING, startIndex)
except ValueError:
raise NotADownloadableLinkError(
f"Could not read the page source on {link}")
data = json.loads(pageSource[startIndex - 1:endIndex + 1].strip()[:-1])
return data
def downloadAlbum(self, images, count):
folderName = GLOBAL.config['filename'].format(**self.post)
folderDir = self.directory / folderName
howManyDownloaded = 0
duplicates = 0
try:
if not os.path.exists(folderDir):
os.makedirs(folderDir)
except FileNotFoundError:
folderDir = self.directory / self.post['POSTID']
os.makedirs(folderDir)
print(folderName)
for i in range(count):
path = urllib.parse.urlparse(images[i]['url']).path
extension = os.path.splitext(path)[1]
filename = "_".join([
str(i + 1), images[i]['id']
]) + extension
shortFilename = str(i + 1) + "_" + images[i]['id']
print("\n ({}/{})".format(i + 1, count))
try:
getFile(filename, shortFilename, folderDir,
images[i]['url'], indent=2)
howManyDownloaded += 1
print()
except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1
except TypeInSkip:
print(" Skipping...")
howManyDownloaded += 1
except Exception as exception:
print("\n Could not get the file")
print(
" " +
"{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exception.__class__.__name__,
info=str(exception)) +
"\n")
print(GLOBAL.log_stream.getvalue(), noPrint=True)
if duplicates == count:
raise FileAlreadyExistsError
if howManyDownloaded + duplicates < count:
raise AlbumNotDownloadedCompletely(
"Album Not Downloaded Completely"
)


@@ -1,53 +0,0 @@
import json
import os
import urllib.request
from bs4 import BeautifulSoup
from src.downloaders.downloaderUtils import getFile, getExtension
from src.errors import (NotADownloadableLinkError)
from src.utils import GLOBAL
from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork
class Gfycat:
def __init__(self, directory, POST):
try:
POST['MEDIAURL'] = self.getLink(POST['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
POST['EXTENSION'] = getExtension(POST['MEDIAURL'])
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"]
shortFilename = POST['POSTID'] + POST['EXTENSION']
getFile(filename, shortFilename, directory, POST['MEDIAURL'])
@staticmethod
def getLink(url):
"""Extract direct link to the video from page's source
and return it
"""
if '.webm' in url or '.mp4' in url or '.gif' in url:
return url
if url[-1:] == '/':
url = url[:-1]
url = "https://gfycat.com/" + url.split('/')[-1]
pageSource = (urllib.request.urlopen(url).read().decode())
soup = BeautifulSoup(pageSource, "html.parser")
attributes = {"data-react-helmet": "true",
"type": "application/ld+json"}
content = soup.find("script", attrs=attributes)
if content is None:
return GifDeliveryNetwork.getLink(url)
return json.loads(content.contents[0])["video"]["contentUrl"]


@@ -1,153 +0,0 @@
import json
import os
import requests
from src.utils import GLOBAL, nameCorrector
from src.utils import printToFile as print
from src.downloaders.Direct import Direct
from src.downloaders.downloaderUtils import getFile
from src.errors import FileNotFoundError, FileAlreadyExistsError, AlbumNotDownloadedCompletely, ImageNotFound, ExtensionError, NotADownloadableLinkError, TypeInSkip
class Imgur:
IMGUR_IMAGE_DOMAIN = "https://i.imgur.com/"
def __init__(self, directory, post):
link = post['CONTENTURL']
if link.endswith(".gifv"):
link = link.replace(".gifv", ".mp4")
Direct(directory, {**post, 'CONTENTURL': link})
return None
self.rawData = self.getData(link)
self.directory = directory
self.post = post
if self.isAlbum:
if self.rawData["album_images"]["count"] != 1:
self.downloadAlbum(self.rawData["album_images"])
else:
self.download(self.rawData["album_images"]["images"][0])
else:
self.download(self.rawData)
def downloadAlbum(self, images):
folderName = GLOBAL.config['filename'].format(**self.post)
folderDir = self.directory / folderName
imagesLenght = images["count"]
howManyDownloaded = 0
duplicates = 0
try:
if not os.path.exists(folderDir):
os.makedirs(folderDir)
except FileNotFoundError:
folderDir = self.directory / self.post['POSTID']
os.makedirs(folderDir)
print(folderName)
for i in range(imagesLenght):
extension = self.validateExtension(images["images"][i]["ext"])
imageURL = self.IMGUR_IMAGE_DOMAIN + \
images["images"][i]["hash"] + extension
filename = "_".join([str(i + 1),
nameCorrector(images["images"][i]['title']),
images["images"][i]['hash']]) + extension
shortFilename = str(i + 1) + "_" + images["images"][i]['hash']
print("\n ({}/{})".format(i + 1, imagesLenght))
try:
getFile(filename, shortFilename, folderDir, imageURL, indent=2)
howManyDownloaded += 1
print()
except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1
except TypeInSkip:
print(" Skipping...")
howManyDownloaded += 1
except Exception as exception:
print("\n Could not get the file")
print(
" " +
"{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exception.__class__.__name__,
info=str(exception)) +
"\n")
print(GLOBAL.log_stream.getvalue(), noPrint=True)
if duplicates == imagesLenght:
raise FileAlreadyExistsError
if howManyDownloaded + duplicates < imagesLenght:
raise AlbumNotDownloadedCompletely(
"Album Not Downloaded Completely"
)
def download(self, image):
extension = self.validateExtension(image["ext"])
imageURL = self.IMGUR_IMAGE_DOMAIN + image["hash"] + extension
filename = GLOBAL.config['filename'].format(**self.post) + extension
shortFilename = self.post['POSTID'] + extension
getFile(filename, shortFilename, self.directory, imageURL)
@property
def isAlbum(self):
return "album_images" in self.rawData
@staticmethod
def getData(link):
cookies = {"over18": "1", "postpagebeta": "0"}
res = requests.get(link, cookies=cookies)
if res.status_code != 200:
raise ImageNotFound(
f"Server responded with {res.status_code} to {link}")
pageSource = requests.get(link, cookies=cookies).text
STARTING_STRING = "image : "
ENDING_STRING = "group :"
STARTING_STRING_LENGHT = len(STARTING_STRING)
try:
startIndex = pageSource.index(
STARTING_STRING) + STARTING_STRING_LENGHT
endIndex = pageSource.index(ENDING_STRING, startIndex)
except ValueError:
raise NotADownloadableLinkError(
f"Could not read the page source on {link}")
while pageSource[endIndex] != "}":
endIndex = endIndex - 1
try:
data = pageSource[startIndex:endIndex + 2].strip()[:-1]
except BaseException:
# Strings are immutable, so append the closing brace instead
data = pageSource[startIndex:endIndex + 1].strip() + "}"
return json.loads(data)
@staticmethod
def validateExtension(string):
POSSIBLE_EXTENSIONS = [".jpg", ".png", ".mp4", ".gif"]
for extension in POSSIBLE_EXTENSIONS:
if extension in string:
return extension
raise ExtensionError(
f"\"{string}\" is not recognized as a valid extension.")


@@ -1,121 +0,0 @@
import sys
import os
import urllib.request
from pathlib import Path
import hashlib
from src.utils import GLOBAL
from src.utils import printToFile as print
from src.errors import FileAlreadyExistsError, FailedToDownload, TypeInSkip, DomainInSkip
def dlProgress(count, blockSize, totalSize):
"""Function for writing download progress to console
"""
downloadedMbs = int(count * blockSize * (10**(-6)))
fileSize = int(totalSize * (10**(-6)))
sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs, fileSize))
sys.stdout.flush()
def getExtension(link):
"""Extract file extension from image link.
If no extension is found, return '.jpg'
"""
imageTypes = ['jpg', 'png', 'mp4', 'webm', 'gif']
parsed = link.split('.')
for fileType in imageTypes:
if fileType in parsed:
return "." + parsed[-1]
if "v.redd.it" not in link:
return '.jpg'
return '.mp4'
def getFile(
filename,
shortFilename,
folderDir,
imageURL,
indent=0,
silent=False):
FORMATS = {
"videos": [".mp4", ".webm"],
"images": [".jpg", ".jpeg", ".png", ".bmp"],
"gifs": [".gif"],
"self": []
}
for type in GLOBAL.arguments.skip:
for extension in FORMATS[type]:
if extension in filename:
raise TypeInSkip
if any(domain in imageURL for domain in GLOBAL.arguments.skip_domain):
raise DomainInSkip
headers = [
("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
"Safari/537.36 OPR/54.0.2952.64"),
("Accept", "text/html,application/xhtml+xml,application/xml;"
"q=0.9,image/webp,image/apng,*/*;q=0.8"),
("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
("Accept-Encoding", "none"),
("Accept-Language", "en-US,en;q=0.8"),
("Connection", "keep-alive")
]
if not os.path.exists(folderDir):
os.makedirs(folderDir)
opener = urllib.request.build_opener()
if "imgur" not in imageURL:
opener.addheaders = headers
urllib.request.install_opener(opener)
if not silent:
print(" " * indent + str(folderDir),
" " * indent + str(filename),
sep="\n")
for i in range(3):
fileDir = Path(folderDir) / filename
tempDir = Path(folderDir) / (filename + ".tmp")
if not (os.path.isfile(fileDir)):
try:
urllib.request.urlretrieve(imageURL,
tempDir,
reporthook=dlProgress)
fileHash = createHash(tempDir)
if GLOBAL.arguments.no_dupes:
if fileHash in GLOBAL.downloadedPosts():
os.remove(tempDir)
raise FileAlreadyExistsError
GLOBAL.downloadedPosts.add(fileHash)
os.rename(tempDir, fileDir)
if not silent:
print(" " * indent + "Downloaded" + " " * 10)
return None
except ConnectionResetError:
raise FailedToDownload
except FileNotFoundError:
filename = shortFilename
else:
raise FileAlreadyExistsError
raise FailedToDownload
def createHash(filename):
hash_md5 = hashlib.md5()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()


@@ -1,52 +0,0 @@
import os
import urllib.request
from bs4 import BeautifulSoup
from src.downloaders.downloaderUtils import getFile, getExtension
from src.errors import (NotADownloadableLinkError)
from src.utils import GLOBAL
class GifDeliveryNetwork:
def __init__(self, directory, POST):
try:
POST['MEDIAURL'] = self.getLink(POST['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
POST['EXTENSION'] = getExtension(POST['MEDIAURL'])
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"]
shortFilename = POST['POSTID'] + POST['EXTENSION']
getFile(filename, shortFilename, directory, POST['MEDIAURL'])
@staticmethod
def getLink(url):
"""Extract direct link to the video from page's source
and return it
"""
if '.webm' in url.split(
'/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]:
return url
if url[-1:] == '/':
url = url[:-1]
url = "https://www.gifdeliverynetwork.com/" + url.split('/')[-1]
pageSource = (urllib.request.urlopen(url).read().decode())
soup = BeautifulSoup(pageSource, "html.parser")
attributes = {"id": "mp4Source", "type": "video/mp4"}
content = soup.find("source", attrs=attributes)
if content is None:
raise NotADownloadableLinkError("Could not read the page source")
return content["src"]


@@ -1,57 +0,0 @@
import json
import os
import urllib.request
from bs4 import BeautifulSoup
from src.downloaders.downloaderUtils import getFile, getExtension
from src.errors import (NotADownloadableLinkError)
from src.utils import GLOBAL
class Redgifs:
def __init__(self, directory, POST):
try:
POST['MEDIAURL'] = self.getLink(POST['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
POST['EXTENSION'] = getExtension(POST['MEDIAURL'])
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"]
shortFilename = POST['POSTID'] + POST['EXTENSION']
getFile(filename, shortFilename, directory, POST['MEDIAURL'])
@staticmethod
def getLink(url):
"""Extract direct link to the video from page's source
and return it
"""
if '.webm' in url or '.mp4' in url or '.gif' in url:
return url
if url[-1:] == '/':
url = url[:-1]
url = urllib.request.Request(
"https://redgifs.com/watch/" + url.split('/')[-1])
url.add_header(
'User-Agent',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
pageSource = (urllib.request.urlopen(url).read().decode())
soup = BeautifulSoup(pageSource, "html.parser")
attributes = {"data-react-helmet": "true",
"type": "application/ld+json"}
content = soup.find("script", attrs=attributes)
if content is None:
raise NotADownloadableLinkError("Could not read the page source")
return json.loads(content.contents[0])["video"]["contentUrl"]


@@ -1,61 +0,0 @@
from src.utils import printToFile as print
import io
import os
from pathlib import Path
from src.errors import FileAlreadyExistsError, TypeInSkip
from src.utils import GLOBAL
VanillaPrint = print
class SelfPost:
def __init__(self, directory, post):
if "self" in GLOBAL.arguments.skip:
raise TypeInSkip
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post)
fileDir = directory / (filename + ".md")
print(fileDir)
print(filename + ".md")
if Path.is_file(fileDir):
raise FileAlreadyExistsError
try:
self.writeToFile(fileDir, post)
except FileNotFoundError:
fileDir = post['POSTID'] + ".md"
fileDir = directory / fileDir
self.writeToFile(fileDir, post)
@staticmethod
def writeToFile(directory, post):
"""Self posts are formatted here"""
content = ("## ["
+ post["TITLE"]
+ "]("
+ post["CONTENTURL"]
+ ")\n"
+ post["CONTENT"]
+ "\n\n---\n\n"
+ "submitted to [r/"
+ post["SUBREDDIT"]
+ "](https://www.reddit.com/r/"
+ post["SUBREDDIT"]
+ ") by [u/"
+ post["REDDITOR"]
+ "](https://www.reddit.com/user/"
+ post["REDDITOR"]
+ ")")
with io.open(directory, "w", encoding="utf-8") as FILE:
VanillaPrint(content, file=FILE)
print("Downloaded")


@@ -1,57 +0,0 @@
import os
import subprocess
from src.downloaders.downloaderUtils import getFile
from src.utils import GLOBAL
from src.utils import printToFile as print
class VReddit:
def __init__(self, directory, post):
extension = ".mp4"
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post) + extension
shortFilename = post['POSTID'] + extension
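# Probe for ffmpeg by calling it; if the call fails, download the
# video stream only instead of merging video and audio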
try:
FNULL = open(os.devnull, 'w')
subprocess.call("ffmpeg", stdout=FNULL, stderr=subprocess.STDOUT)
except BaseException:
getFile(filename, shortFilename, directory, post['CONTENTURL'])
print("FFMPEG library not found, skipping merging video and audio")
else:
videoName = post['POSTID'] + "_video"
videoURL = post['CONTENTURL']
audioName = post['POSTID'] + "_audio"
audioURL = videoURL[:videoURL.rfind('/')] + '/DASH_audio.mp4'
print(directory, filename, sep="\n")
getFile(videoName, videoName, directory, videoURL, silent=True)
getFile(audioName, audioName, directory, audioURL, silent=True)
try:
self._mergeAudio(videoName,
audioName,
filename,
shortFilename,
directory)
except KeyboardInterrupt:
os.remove(directory / filename)
os.remove(directory / audioName)
os.rename(directory / videoName, directory / filename)
@staticmethod
def _mergeAudio(video, audio, filename, shortFilename, directory):
inputVideo = str(directory / video)
inputAudio = str(directory / audio)
FNULL = open(os.devnull, 'w')
cmd = f"ffmpeg -i {inputAudio} -i {inputVideo} -c:v copy -c:a aac -strict experimental {str(directory / filename)}"
subprocess.call(cmd.split(), stdout=FNULL, stderr=subprocess.STDOUT)
os.remove(directory / video)
os.remove(directory / audio)


@@ -1,53 +0,0 @@
import os
import youtube_dl
import sys
from src.downloaders.downloaderUtils import createHash
from src.utils import GLOBAL
from src.utils import printToFile as print
from src.errors import FileAlreadyExistsError
class Youtube:
def __init__(self, directory, post):
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post)
print(filename)
self.download(filename, directory, post['CONTENTURL'])
def download(self, filename, directory, url):
ydl_opts = {
"format": "best",
"outtmpl": str(directory / (filename + ".%(ext)s")),
"progress_hooks": [self._hook],
"playlistend": 1,
"nooverwrites": True,
"quiet": True
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
location = directory / (filename + ".mp4")
if GLOBAL.arguments.no_dupes:
try:
fileHash = createHash(location)
except FileNotFoundError:
return None
if fileHash in GLOBAL.downloadedPosts():
os.remove(location)
raise FileAlreadyExistsError
GLOBAL.downloadedPosts.add(fileHash)
@staticmethod
def _hook(d):
if d['status'] == 'finished':
return print("Downloaded")
downloadedMbs = int(d['downloaded_bytes'] * (10**(-6)))
fileSize = int(d['total_bytes'] * (10**(-6)))
sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs, fileSize))
sys.stdout.flush()


@@ -1,140 +0,0 @@
import sys
def full_exc_info(exc_info):
def current_stack(skip=0):
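# Deliberately raise and catch an exception purely to obtain a
# reference to the current stack frame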
try:
1 / 0
except ZeroDivisionError:
f = sys.exc_info()[2].tb_frame
for i in range(skip + 2):
f = f.f_back
lst = []
while f is not None:
lst.append((f, f.f_lineno))
f = f.f_back
return lst
def extend_traceback(tb, stack):
class FauxTb():
def __init__(self, tb_frame, tb_lineno, tb_next):
self.tb_frame = tb_frame
self.tb_lineno = tb_lineno
self.tb_next = tb_next
"""Extend traceback with stack info."""
head = tb
for tb_frame, tb_lineno in stack:
head = FauxTb(tb_frame, tb_lineno, head)
return head
"""Like sys.exc_info, but includes the full traceback."""
t, v, tb = exc_info
full_tb = extend_traceback(tb, current_stack(1))
return t, v, full_tb
class RedditLoginFailed(Exception):
pass
class ImgurLoginError(Exception):
pass
class FileAlreadyExistsError(Exception):
pass
class NotADownloadableLinkError(Exception):
pass
class AlbumNotDownloadedCompletely(Exception):
pass
class FileNameTooLong(Exception):
pass
class InvalidRedditLink(Exception):
pass
class ProgramModeError(Exception):
pass
class SearchModeError(Exception):
pass
class RedditorNameError(Exception):
pass
class NoMatchingSubmissionFound(Exception):
pass
class NoPrawSupport(Exception):
pass
class NoRedditSupport(Exception):
pass
class MultiredditNotFound(Exception):
pass
class InsufficientPermission(Exception):
pass
class InvalidSortingType(Exception):
pass
class FileNotFoundError(Exception):
pass
class NoSuitablePost(Exception):
pass
class ImgurLimitError(Exception):
pass
class DirectLinkNotFound(Exception):
pass
class InvalidJSONFile(Exception):
pass
class FailedToDownload(Exception):
pass
class TypeInSkip(Exception):
pass
class DomainInSkip(Exception):
pass
class ImageNotFound(Exception):
pass
class ExtensionError(Exception):
pass


@@ -1,61 +0,0 @@
import json
from os import path, remove
from src.errors import InvalidJSONFile
class JsonFile:
""" Write and read JSON files
Use add(self,toBeAdded) to add to files
Use delete(self,*deletedKeys) to delete keys
"""
FILEDIR = ""
def __init__(self, FILEDIR):
self.FILEDIR = FILEDIR
if not path.exists(self.FILEDIR):
self.__writeToFile({}, create=True)
def read(self):
try:
with open(self.FILEDIR, 'r') as f:
return json.load(f)
except json.decoder.JSONDecodeError:
raise InvalidJSONFile(f"{self.FILEDIR} cannot be read")
def add(self, toBeAdded, sub=None):
"""Takes a dictionary and merges it with json file.
It uses new key's value if a key already exists.
Returns the new content as a dictionary.
"""
data = self.read()
if sub:
data[sub] = {**data[sub], **toBeAdded}
else:
data = {**data, **toBeAdded}
self.__writeToFile(data)
return self.read()
def delete(self, *deleteKeys):
"""Delete given keys from JSON file.
Returns the new content as a dictionary.
"""
data = self.read()
found = False
for deleteKey in deleteKeys:
if deleteKey in data:
del data[deleteKey]
found = True
if not found:
return False
self.__writeToFile(data)
def __writeToFile(self, content, create=False):
if not create:
remove(self.FILEDIR)
with open(self.FILEDIR, 'w') as f:
json.dump(content, f, indent=4)


@@ -1,243 +0,0 @@
from pprint import pprint
try:
from src.errors import InvalidRedditLink
except ModuleNotFoundError:
from errors import InvalidRedditLink
def QueryParser(PassedQueries, index):
ExtractedQueries = {}
QuestionMarkIndex = PassedQueries.index("?")
Header = PassedQueries[:QuestionMarkIndex]
ExtractedQueries["HEADER"] = Header
Queries = PassedQueries[QuestionMarkIndex + 1:]
ParsedQueries = Queries.split("&")
for Query in ParsedQueries:
Query = Query.split("=")
ExtractedQueries[Query[0]] = Query[1]
if ExtractedQueries["HEADER"] == "search":
ExtractedQueries["q"] = ExtractedQueries["q"].replace("%20", " ")
return ExtractedQueries
def LinkParser(LINK):
RESULT = {}
ShortLink = False
if "reddit.com" not in LINK:
raise InvalidRedditLink("Invalid reddit link")
SplittedLink = LINK.split("/")
if SplittedLink[0] == "https:" or SplittedLink[0] == "http:":
SplittedLink = SplittedLink[2:]
try:
if (SplittedLink[-2].endswith("reddit.com") and
SplittedLink[-1] == "") or \
SplittedLink[-1].endswith("reddit.com"):
RESULT["sort"] = "best"
return RESULT
except IndexError:
if SplittedLink[0].endswith("reddit.com"):
RESULT["sort"] = "best"
return RESULT
if "redd.it" in SplittedLink:
ShortLink = True
if SplittedLink[0].endswith("reddit.com"):
SplittedLink = SplittedLink[1:]
if "comments" in SplittedLink:
RESULT = {"post": LINK}
return RESULT
if "me" in SplittedLink or \
"u" in SplittedLink or \
"user" in SplittedLink or \
"r" in SplittedLink or \
"m" in SplittedLink:
if "r" in SplittedLink:
RESULT["subreddit"] = SplittedLink[SplittedLink.index("r") + 1]
elif "m" in SplittedLink:
RESULT["multireddit"] = SplittedLink[SplittedLink.index("m") + 1]
RESULT["user"] = SplittedLink[SplittedLink.index("m") - 1]
else:
for index in range(len(SplittedLink)):
if SplittedLink[index] == "u" or \
SplittedLink[index] == "user":
RESULT["user"] = SplittedLink[index + 1]
elif SplittedLink[index] == "me":
RESULT["user"] = "me"
for index in range(len(SplittedLink)):
if SplittedLink[index] in [
"hot", "top", "new", "controversial", "rising"
]:
RESULT["sort"] = SplittedLink[index]
if index == 0:
RESULT["subreddit"] = "frontpage"
elif SplittedLink[index] in ["submitted", "saved", "posts", "upvoted"]:
if SplittedLink[index] == "submitted" or \
SplittedLink[index] == "posts":
RESULT["submitted"] = {}
elif SplittedLink[index] == "saved":
RESULT["saved"] = True
elif SplittedLink[index] == "upvoted":
RESULT["upvoted"] = True
elif "?" in SplittedLink[index]:
ParsedQuery = QueryParser(SplittedLink[index], index)
if ParsedQuery["HEADER"] == "search":
del ParsedQuery["HEADER"]
RESULT["search"] = ParsedQuery
elif ParsedQuery["HEADER"] == "submitted" or \
ParsedQuery["HEADER"] == "posts":
del ParsedQuery["HEADER"]
RESULT["submitted"] = ParsedQuery
else:
del ParsedQuery["HEADER"]
RESULT["queries"] = ParsedQuery
if not ("upvoted" in RESULT or
"saved" in RESULT or
"submitted" in RESULT or
"multireddit" in RESULT) and \
"user" in RESULT:
RESULT["submitted"] = {}
return RESULT
def LinkDesigner(LINK):
attributes = LinkParser(LINK)
MODE = {}
if "post" in attributes:
MODE["post"] = attributes["post"]
MODE["sort"] = ""
MODE["time"] = ""
return MODE
if "search" in attributes:
MODE["search"] = attributes["search"]["q"]
if "restrict_sr" in attributes["search"]:
if not (attributes["search"]["restrict_sr"] == 0 or
attributes["search"]["restrict_sr"] == "off" or
attributes["search"]["restrict_sr"] == ""):
if "subreddit" in attributes:
MODE["subreddit"] = attributes["subreddit"]
elif "multireddit" in attributes:
MODE["multreddit"] = attributes["multireddit"]
MODE["user"] = attributes["user"]
else:
MODE["subreddit"] = "all"
else:
MODE["subreddit"] = "all"
if "t" in attributes["search"]:
MODE["time"] = attributes["search"]["t"]
else:
MODE["time"] = "all"
if "sort" in attributes["search"]:
MODE["sort"] = attributes["search"]["sort"]
else:
MODE["sort"] = "relevance"
if "include_over_18" in attributes["search"]:
if attributes["search"]["include_over_18"] == 1 or \
attributes["search"]["include_over_18"] == "on":
MODE["nsfw"] = True
else:
MODE["nsfw"] = False
else:
if "queries" in attributes:
if not ("submitted" in attributes or
"posts" in attributes):
if "t" in attributes["queries"]:
MODE["time"] = attributes["queries"]["t"]
else:
MODE["time"] = "day"
else:
if "t" in attributes["queries"]:
MODE["time"] = attributes["queries"]["t"]
else:
MODE["time"] = "all"
if "sort" in attributes["queries"]:
MODE["sort"] = attributes["queries"]["sort"]
else:
MODE["sort"] = "new"
else:
MODE["time"] = "day"
if "subreddit" in attributes and "search" not in attributes:
MODE["subreddit"] = attributes["subreddit"]
elif "user" in attributes and "search" not in attributes:
MODE["user"] = attributes["user"]
if "submitted" in attributes:
MODE["submitted"] = True
if "sort" in attributes["submitted"]:
MODE["sort"] = attributes["submitted"]["sort"]
elif "sort" in MODE:
pass
else:
MODE["sort"] = "new"
if "t" in attributes["submitted"]:
MODE["time"] = attributes["submitted"]["t"]
else:
MODE["time"] = "all"
elif "saved" in attributes:
MODE["saved"] = True
elif "upvoted" in attributes:
MODE["upvoted"] = True
elif "multireddit" in attributes:
MODE["multireddit"] = attributes["multireddit"]
if "sort" in attributes:
MODE["sort"] = attributes["sort"]
elif "sort" in MODE:
pass
else:
MODE["sort"] = "hot"
return MODE
if __name__ == "__main__":
while True:
link = input("> ")
pprint(LinkDesigner(link))


@@ -1,277 +0,0 @@
from src.errors import SearchModeError, RedditorNameError, ProgramModeError, InvalidSortingType
from src.parser import LinkDesigner
from pathlib import Path
import sys
class ProgramMode:
def __init__(self, arguments):
self.arguments = arguments
def generate(self):
try:
self._validateProgramMode()
except ProgramModeError:
self._promptUser()
programMode = {}
if self.arguments.user is not None:
programMode["user"] = self.arguments.user
if self.arguments.search is not None:
programMode["search"] = self.arguments.search
if self.arguments.sort == "hot" or \
self.arguments.sort == "controversial" or \
self.arguments.sort == "rising":
self.arguments.sort = "relevance"
if self.arguments.sort is not None:
programMode["sort"] = self.arguments.sort
else:
if self.arguments.submitted:
programMode["sort"] = "new"
else:
programMode["sort"] = "hot"
if self.arguments.time is not None:
programMode["time"] = self.arguments.time
else:
programMode["time"] = "all"
if self.arguments.link is not None:
self.arguments.link = self.arguments.link.strip("\"")
programMode = LinkDesigner(self.arguments.link)
if self.arguments.search is not None:
programMode["search"] = self.arguments.search
if self.arguments.sort is not None:
programMode["sort"] = self.arguments.sort
if self.arguments.time is not None:
programMode["time"] = self.arguments.time
elif self.arguments.subreddit is not None:
if isinstance(self.arguments.subreddit, list):
self.arguments.subreddit = "+".join(self.arguments.subreddit)
programMode["subreddit"] = self.arguments.subreddit
elif self.arguments.multireddit is not None:
programMode["multireddit"] = self.arguments.multireddit
elif self.arguments.saved is True:
programMode["saved"] = True
elif self.arguments.upvoted is True:
programMode["upvoted"] = True
elif self.arguments.submitted is not None:
programMode["submitted"] = True
if self.arguments.sort == "rising":
raise InvalidSortingType("An invalid sorting type was given")
programMode["limit"] = self.arguments.limit
return programMode
@staticmethod
def _chooseFrom(choices):
print()
choicesByIndex = [str(x) for x in range(len(choices) + 1)]
for i in range(len(choices)):
print("{indent}[{order}] {mode}".format(
indent=" " * 4, order=i + 1, mode=choices[i]
))
print(" " * 4 + "[0] exit\n")
choice = input("> ")
        while choice.lower() not in choices + choicesByIndex + ["exit"]:
            print("Invalid input\n")
            choice = input("> ")
if choice == "0" or choice == "exit":
sys.exit()
elif choice in choicesByIndex:
return choices[int(choice) - 1]
else:
return choice
def _promptUser(self):
print("select program mode:")
programModes = [
"search", "subreddit", "multireddit",
"submitted", "upvoted", "saved", "log"
]
programMode = self._chooseFrom(programModes)
if programMode == "search":
self.arguments.search = input("\nquery: ")
self.arguments.subreddit = input("\nsubreddit: ")
print("\nselect sort type:")
sortTypes = [
"relevance", "top", "new"
]
sortType = self._chooseFrom(sortTypes)
self.arguments.sort = sortType
print("\nselect time filter:")
timeFilters = [
"hour", "day", "week", "month", "year", "all"
]
timeFilter = self._chooseFrom(timeFilters)
self.arguments.time = timeFilter
if programMode == "subreddit":
            subredditInput = input(
                "(type frontpage for all subscribed subreddits,\n"
                " use plus to separate multiple subreddits:"
                " pics+funny+me_irl etc.)\n\n"
                "subreddit: ")
self.arguments.subreddit = subredditInput
# while not (subredditInput == "" or subredditInput.lower() == "frontpage"):
# subredditInput = input("subreddit: ")
# self.arguments.subreddit += "+" + subredditInput
if " " in self.arguments.subreddit:
self.arguments.subreddit = "+".join(
self.arguments.subreddit.split())
            # drop a trailing plus (+), if any
if not subredditInput.lower() == "frontpage" \
and self.arguments.subreddit[-1] == "+":
self.arguments.subreddit = self.arguments.subreddit[:-1]
print("\nselect sort type:")
sortTypes = [
"hot", "top", "new", "rising", "controversial"
]
sortType = self._chooseFrom(sortTypes)
self.arguments.sort = sortType
if sortType in ["top", "controversial"]:
print("\nselect time filter:")
timeFilters = [
"hour", "day", "week", "month", "year", "all"
]
timeFilter = self._chooseFrom(timeFilters)
self.arguments.time = timeFilter
else:
self.arguments.time = "all"
elif programMode == "multireddit":
self.arguments.user = input("\nmultireddit owner: ")
self.arguments.multireddit = input("\nmultireddit: ")
print("\nselect sort type:")
sortTypes = [
"hot", "top", "new", "rising", "controversial"
]
sortType = self._chooseFrom(sortTypes)
self.arguments.sort = sortType
if sortType in ["top", "controversial"]:
print("\nselect time filter:")
timeFilters = [
"hour", "day", "week", "month", "year", "all"
]
timeFilter = self._chooseFrom(timeFilters)
self.arguments.time = timeFilter
else:
self.arguments.time = "all"
elif programMode == "submitted":
self.arguments.submitted = True
self.arguments.user = input("\nredditor: ")
print("\nselect sort type:")
sortTypes = [
"hot", "top", "new", "controversial"
]
sortType = self._chooseFrom(sortTypes)
self.arguments.sort = sortType
if sortType == "top":
print("\nselect time filter:")
timeFilters = [
"hour", "day", "week", "month", "year", "all"
]
timeFilter = self._chooseFrom(timeFilters)
self.arguments.time = timeFilter
else:
self.arguments.time = "all"
elif programMode == "upvoted":
self.arguments.upvoted = True
self.arguments.user = input("\nredditor: ")
elif programMode == "saved":
self.arguments.saved = True
elif programMode == "log":
while True:
                self.arguments.log = input("\nlog file directory: ")
if Path(self.arguments.log).is_file():
break
while True:
try:
self.arguments.limit = int(input("\nlimit (0 for none): "))
if self.arguments.limit == 0:
self.arguments.limit = None
break
except ValueError:
pass
def _validateProgramMode(self):
"""Check if command-line self.arguments are given correcly,
if not, raise errors
"""
        user = 0 if self.arguments.user is None else 1
search = 1 if self.arguments.search else 0
modes = [
"saved",
"subreddit",
"submitted",
"log",
"link",
"upvoted",
"multireddit"]
values = {
x: 0 if getattr(self.arguments, x) is None or
getattr(self.arguments, x) is False
else 1
for x in modes
}
if not sum(values[x] for x in values) == 1:
raise ProgramModeError("Invalid program mode")
if search + values["saved"] == 2:
raise SearchModeError("You cannot search in your saved posts")
if search + values["submitted"] == 2:
raise SearchModeError("You cannot search in submitted posts")
if search + values["upvoted"] == 2:
raise SearchModeError("You cannot search in upvoted posts")
if search + values["log"] == 2:
raise SearchModeError("You cannot search in log files")
if values["upvoted"] + values["submitted"] == 1 and user == 0:
raise RedditorNameError("No redditor name given")

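A sketch of how ProgramMode.generate above resolves a plain subreddit invocation. The Namespace stands in for the parsed command-line arguments; its attribute set mirrors the attributes referenced in the class, and the concrete values are illustrative.

```
from argparse import Namespace

args = Namespace(
    user=None, search=None, sort="top", time="week", limit=10,
    link=None, subreddit=["pics", "funny"], multireddit=None,
    saved=False, upvoted=False, submitted=False, log=None,
)
print(ProgramMode(args).generate())
# -> {'sort': 'top', 'time': 'week', 'subreddit': 'pics+funny', 'limit': 10}
```

Exactly one mode (subreddit) is set, so _validateProgramMode passes and no interactive prompt is shown.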
View File

@@ -1,104 +0,0 @@
import praw
import random
import socket
import webbrowser
from prawcore.exceptions import ResponseException
from src.utils import GLOBAL
from src.jsonHelper import JsonFile
from src.errors import RedditLoginFailed
class Reddit:
def __init__(self, refresh_token=None):
self.SCOPES = ['identity', 'history', 'read', 'save']
self.PORT = 7634
self.refresh_token = refresh_token
self.redditInstance = None
self.arguments = {
"client_id": GLOBAL.reddit_client_id,
"client_secret": GLOBAL.reddit_client_secret,
"user_agent": str(socket.gethostname())
}
def begin(self):
if self.refresh_token:
self.arguments["refresh_token"] = self.refresh_token
self.redditInstance = praw.Reddit(**self.arguments)
try:
self.redditInstance.auth.scopes()
return self.redditInstance
except ResponseException:
self.arguments["redirect_uri"] = "http://localhost:" + \
str(self.PORT)
self.redditInstance = praw.Reddit(**self.arguments)
reddit, refresh_token = self.getRefreshToken(*self.SCOPES)
else:
self.arguments["redirect_uri"] = "http://localhost:" + \
str(self.PORT)
self.redditInstance = praw.Reddit(**self.arguments)
reddit, refresh_token = self.getRefreshToken(*self.SCOPES)
JsonFile(GLOBAL.configDirectory).add({
"reddit_username": str(reddit.user.me()),
"reddit": refresh_token
}, "credentials")
return self.redditInstance
    def receive_connection(self):
        """Wait for and then return a connected socket.
        Opens a TCP listener on self.PORT and waits for a single client.
        """
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('0.0.0.0', self.PORT))
server.listen(1)
client = server.accept()[0]
server.close()
return client
@staticmethod
def send_message(client, message):
"""Send message to client and close the connection."""
client.send(
'HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')
)
client.close()
def getRefreshToken(self, *scopes):
state = str(random.randint(0, 65000))
url = self.redditInstance.auth.url(scopes, state, 'permanent')
print("---Setting up the Reddit API---\n")
print(
"Go to this URL and login to reddit:\n",
url,
sep="\n",
end="\n\n")
webbrowser.open(url, new=2)
        client = self.receive_connection()
data = client.recv(1024).decode('utf-8')
param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&')
params = dict([token.split('=')
for token in param_tokens])
if state != params['state']:
self.send_message(
client, 'State mismatch. Expected: {} Received: {}'
.format(state, params['state'])
)
raise RedditLoginFailed
if 'error' in params:
self.send_message(client, params['error'])
raise RedditLoginFailed
refresh_token = self.redditInstance.auth.authorize(params['code'])
self.send_message(client,
"<script>"
"alert(\"You can go back to terminal window now.\");"
"</script>"
)
return (self.redditInstance, refresh_token)

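A sketch of the OAuth round trip implemented by the Reddit helper above. It assumes GLOBAL and the credentials config are already set up; nothing here is new API, it just exercises begin() as defined in the class.

```
# First run: no refresh token, so begin() opens a browser window for
# reddit's OAuth consent on http://localhost:7634 and stores the
# resulting refresh token in the credentials config.
reddit = Reddit(refresh_token=None).begin()

# Later runs: pass the stored token and skip the browser entirely.
# reddit = Reddit(refresh_token=stored_token).begin()
print(reddit.user.me())  # the authenticated redditor
```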
View File

@@ -1,374 +0,0 @@
import sys
import time
import urllib.request
from prawcore.exceptions import NotFound, Forbidden
from src.reddit import Reddit
from src.utils import GLOBAL, createLogFile, printToFile
from src.errors import (NoMatchingSubmissionFound, NoPrawSupport,
MultiredditNotFound,
InvalidSortingType, InsufficientPermission)
print = printToFile
def getPosts(programMode):
"""Call PRAW regarding to arguments and pass it to extractDetails.
Return what extractDetails has returned.
"""
reddit = Reddit(GLOBAL.config["credentials"]["reddit"]).begin()
if programMode["sort"] == "best":
raise NoPrawSupport("PRAW does not support that")
if "subreddit" in programMode:
if "search" in programMode:
if programMode["subreddit"] == "frontpage":
programMode["subreddit"] = "all"
if "user" in programMode:
if programMode["user"] == "me":
programMode["user"] = str(reddit.user.me())
if "search" not in programMode:
if programMode["sort"] == "top" or programMode["sort"] == "controversial":
keyword_params = {
"time_filter": programMode["time"],
"limit": programMode["limit"]
}
# OTHER SORT TYPES DON'T TAKE TIME_FILTER
else:
keyword_params = {
"limit": programMode["limit"]
}
else:
keyword_params = {
"time_filter": programMode["time"],
"limit": programMode["limit"]
}
if "search" in programMode:
if programMode["sort"] in ["hot", "rising", "controversial"]:
            raise InvalidSortingType("Invalid sorting type was given")
if "subreddit" in programMode:
print(
"search for \"{search}\" in\n"
"subreddit: {subreddit}\nsort: {sort}\n"
"time: {time}\nlimit: {limit}\n".format(
search=programMode["search"],
limit=programMode["limit"],
sort=programMode["sort"],
subreddit=programMode["subreddit"],
time=programMode["time"]
).upper(), noPrint=True
)
return extractDetails(
reddit.subreddit(programMode["subreddit"]).search(
programMode["search"],
limit=programMode["limit"],
sort=programMode["sort"],
time_filter=programMode["time"]
)
)
if "multireddit" in programMode:
raise NoPrawSupport("PRAW does not support that")
if "user" in programMode:
raise NoPrawSupport("PRAW does not support that")
if "saved" in programMode:
raise ("Reddit does not support that")
if programMode["sort"] == "relevance":
            raise InvalidSortingType("Invalid sorting type was given")
if "saved" in programMode:
print(
"saved posts\nuser:{username}\nlimit={limit}\n".format(
username=reddit.user.me(),
limit=programMode["limit"]
).upper(), noPrint=True
)
return extractDetails(
reddit.user.me().saved(
limit=programMode["limit"]))
if "subreddit" in programMode:
if programMode["subreddit"] == "frontpage":
print(
"subreddit: {subreddit}\nsort: {sort}\n"
"time: {time}\nlimit: {limit}\n".format(
limit=programMode["limit"],
sort=programMode["sort"],
subreddit=programMode["subreddit"],
time=programMode["time"]
).upper(), noPrint=True
)
return extractDetails(
getattr(reddit.front, programMode["sort"])(**keyword_params)
)
print(
"subreddit: {subreddit}\nsort: {sort}\n"
"time: {time}\nlimit: {limit}\n".format(
limit=programMode["limit"],
sort=programMode["sort"],
subreddit=programMode["subreddit"],
time=programMode["time"]
).upper(), noPrint=True
)
return extractDetails(
getattr(
reddit.subreddit(programMode["subreddit"]), programMode["sort"]
)(**keyword_params)
)
if "multireddit" in programMode:
print(
"user: {user}\n"
"multireddit: {multireddit}\nsort: {sort}\n"
"time: {time}\nlimit: {limit}\n".format(
user=programMode["user"],
limit=programMode["limit"],
sort=programMode["sort"],
multireddit=programMode["multireddit"],
time=programMode["time"]
).upper(), noPrint=True
)
try:
return extractDetails(
getattr(
reddit.multireddit(
programMode["user"], programMode["multireddit"]
), programMode["sort"]
)(**keyword_params)
)
except NotFound:
raise MultiredditNotFound("Multireddit not found")
elif "submitted" in programMode:
print(
"submitted posts of {user}\nsort: {sort}\n"
"time: {time}\nlimit: {limit}\n".format(
limit=programMode["limit"],
sort=programMode["sort"],
user=programMode["user"],
time=programMode["time"]
).upper(), noPrint=True
)
        return extractDetails(
            getattr(
                reddit.redditor(programMode["user"]).submissions,
                programMode["sort"]
            )(**keyword_params)
        )
elif "upvoted" in programMode:
print(
"upvoted posts of {user}\nlimit: {limit}\n".format(
user=programMode["user"],
limit=programMode["limit"]
).upper(), noPrint=True
)
try:
return extractDetails(
reddit.redditor(programMode["user"]).upvoted(
limit=programMode["limit"])
)
except Forbidden:
raise InsufficientPermission(
"You do not have permission to do that")
elif "post" in programMode:
print("post: {post}\n".format(
post=programMode["post"]).upper(), noPrint=True)
return extractDetails(
reddit.submission(url=programMode["post"]), SINGLE_POST=True
)
def extractDetails(posts, SINGLE_POST=False):
"""Check posts and decide if it can be downloaded.
If so, create a dictionary with post details and append them to a list.
Write all of posts to file. Return the list
"""
postList = []
postCount = 1
allPosts = {}
print("\nGETTING POSTS")
postsFile = createLogFile("POSTS")
if SINGLE_POST:
submission = posts
postCount += 1
try:
details = {'POSTID': submission.id,
'TITLE': submission.title,
'REDDITOR': str(submission.author),
'TYPE': None,
'CONTENTURL': submission.url,
'SUBREDDIT': submission.subreddit.display_name,
'UPVOTES': submission.score,
'FLAIR': submission.link_flair_text,
'DATE': str(time.strftime(
"%Y-%m-%d_%H-%M",
time.localtime(submission.created_utc)
))}
if 'gallery' in submission.url:
details['CONTENTURL'] = genLinksifGallery(submission.media_metadata)
except AttributeError:
pass
if not any(
domain in submission.domain for domain in GLOBAL.arguments.skip_domain):
result = matchWithDownloader(submission)
if result is not None:
details = {**details, **result}
postList.append(details)
postsFile.add({postCount: details})
else:
try:
for submission in posts:
                if postCount % 100 == 0:
                    # progress marker (any visible character would do;
                    # '.' assumed here)
                    sys.stdout.write(".")
                    sys.stdout.flush()
                if postCount % 1000 == 0:
                    # break the progress line every 1000 posts
                    sys.stdout.write("\n" + " " * 14)
                    sys.stdout.flush()
try:
details = {'POSTID': submission.id,
'TITLE': submission.title,
'REDDITOR': str(submission.author),
'TYPE': None,
'CONTENTURL': submission.url,
'SUBREDDIT': submission.subreddit.display_name,
'UPVOTES': submission.score,
'FLAIR': submission.link_flair_text,
'DATE': str(time.strftime(
"%Y-%m-%d_%H-%M",
time.localtime(submission.created_utc)
))}
if 'gallery' in submission.url:
details['CONTENTURL'] = genLinksifGallery(submission.media_metadata)
except AttributeError:
continue
if details['POSTID'] in GLOBAL.downloadedPosts():
continue
if not any(
domain in submission.domain for domain in GLOBAL.arguments.skip_domain):
result = matchWithDownloader(submission)
if result is not None:
details = {**details, **result}
postList.append(details)
allPosts[postCount] = details
postCount += 1
except KeyboardInterrupt:
print("\nKeyboardInterrupt", noPrint=True)
postsFile.add(allPosts)
if len(postList) != 0:
print()
return postList
raise NoMatchingSubmissionFound("No matching submission was found")
def matchWithDownloader(submission):
if 'gallery' in submission.url:
        return {'TYPE': 'gallery'}
directLink = extractDirectLink(submission.url)
if directLink:
return {'TYPE': 'direct',
'CONTENTURL': directLink}
    if 'v.redd.it' in submission.domain:
        # probe the DASH renditions from highest to lowest quality and
        # take the first one that responds with HTTP 200
        bitrates = ["DASH_1080", "DASH_720", "DASH_600",
                    "DASH_480", "DASH_360", "DASH_240"]
for bitrate in bitrates:
videoURL = submission.url + "/" + bitrate + ".mp4"
try:
responseCode = urllib.request.urlopen(videoURL).getcode()
except urllib.error.HTTPError:
responseCode = 0
if responseCode == 200:
return {'TYPE': 'v.redd.it', 'CONTENTURL': videoURL}
if 'gfycat' in submission.domain:
return {'TYPE': 'gfycat'}
if 'youtube' in submission.domain \
and 'watch' in submission.url:
return {'TYPE': 'youtube'}
if 'youtu.be' in submission.domain:
url = urllib.request.urlopen(submission.url).geturl()
if 'watch' in url:
return {'TYPE': 'youtube'}
elif 'imgur' in submission.domain:
return {'TYPE': 'imgur'}
elif 'erome' in submission.domain:
return {'TYPE': 'erome'}
elif 'redgifs' in submission.domain:
return {'TYPE': 'redgifs'}
elif 'gifdeliverynetwork' in submission.domain:
return {'TYPE': 'gifdeliverynetwork'}
if 'reddit.com/gallery' in submission.url:
return {'TYPE': 'gallery'}
if submission.is_self and 'self' not in GLOBAL.arguments.skip:
return {'TYPE': 'self',
'CONTENT': submission.selftext}
def extractDirectLink(URL):
"""Check if link is a direct image link.
If so, return URL,
if not, return False
"""
imageTypes = ['jpg', 'jpeg', 'png', 'mp4', 'webm', 'gif']
if URL[-1] == "/":
URL = URL[:-1]
if "i.reddituploads.com" in URL:
return URL
for extension in imageTypes:
if extension == URL.split(".")[-1]:
return URL
return None
def genLinksifGallery(metadata):
    """Collect direct image links from a gallery's media metadata."""
    galleryImgUrls = list()
    if metadata is not None:
        for key in metadata:
            # strip query parameters and swap the preview host for the
            # full-resolution i.redd.it host
            galleryImgUrls.append(
                metadata[key]['s']['u'].split('?')[0].replace('preview', 'i'))
    return galleryImgUrls

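extractDirectLink above is self-contained, so its contract is easy to illustrate; the URLs below are made up.

```
# Illustrative checks for extractDirectLink (defined above).
url = "https://i.redd.it/abc123.jpg"
assert extractDirectLink(url) == url            # known media extension
assert extractDirectLink("https://i.reddituploads.com/abc123") \
    == "https://i.reddituploads.com/abc123"     # special-cased host
assert extractDirectLink("https://example.com/post") is None  # not direct
```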
View File

@@ -1,25 +0,0 @@
from os import path
class Store:
def __init__(self, directory=None):
self.directory = directory
if self.directory:
if path.exists(directory):
with open(directory, 'r') as f:
self.list = f.read().split("\n")
else:
with open(self.directory, 'a'):
pass
self.list = []
else:
self.list = []
def __call__(self):
return self.list
def add(self, data):
self.list.append(data)
if self.directory:
with open(self.directory, 'a') as f:
f.write("{data}\n".format(data=data))

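Store above is a minimal append-only persistence helper; a usage sketch (the path and post ID are illustrative):

```
store = Store("downloaded_posts.txt")  # creates the file if it is missing
store.add("t3_abc123")                 # appended to the list and the file
print(store())                         # the in-memory list of entries
```

Note that reading an existing file back with read().split("\n") leaves a trailing empty entry in the list, because add() writes a newline after every record.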
View File

@@ -1,100 +0,0 @@
import io
import sys
from os import makedirs, path
from pathlib import Path
from src.jsonHelper import JsonFile
class GLOBAL:
"""Declare global variables"""
RUN_TIME = ""
config = {'imgur_client_id': None, 'imgur_client_secret': None}
arguments = None
directory = None
defaultConfigDirectory = Path.home() / "Bulk Downloader for Reddit"
configDirectory = ""
reddit_client_id = "U-6gk4ZCh3IeNQ"
reddit_client_secret = "7CZHY6AmKweZME5s50SfDGylaPg"
@staticmethod
    def downloadedPosts():
        return []
printVanilla = print
log_stream = None
def createLogFile(TITLE):
"""Create a log file with given name
inside a folder time stampt in its name and
put given arguments inside \"HEADER\" key
"""
folderDirectory = GLOBAL.directory / "LOG_FILES" / GLOBAL.RUN_TIME
logFilename = TITLE.upper() + '.json'
if not path.exists(folderDirectory):
makedirs(folderDirectory)
FILE = JsonFile(folderDirectory / Path(logFilename))
HEADER = " ".join(sys.argv)
FILE.add({"HEADER": HEADER})
return FILE
def printToFile(*args, noPrint=False, **kwargs):
"""Print to both CONSOLE and
CONSOLE LOG file in a folder time stampt in the name
"""
folderDirectory = GLOBAL.directory / \
Path("LOG_FILES") / Path(GLOBAL.RUN_TIME)
if not noPrint or \
GLOBAL.arguments.verbose or \
"file" in kwargs:
print(*args, **kwargs)
if not path.exists(folderDirectory):
makedirs(folderDirectory)
if "file" not in kwargs:
with io.open(
folderDirectory / "CONSOLE_LOG.txt", "a", encoding="utf-8"
) as FILE:
print(*args, file=FILE, **kwargs)
def nameCorrector(string, reference=None):
"""Swap strange characters from given string
with underscore (_) and shorten it.
Return the string
"""
    LIMIT = 247
    stringLength = len(string)
    # reserve room for the reference (e.g. a file path) when shortening
    referenceLength = len(reference) if reference else 0
    totalLength = stringLength + referenceLength
    if totalLength > LIMIT:
        limit = LIMIT - referenceLength
        string = string[:limit - 1]
string = string.replace(" ", "_")
    string = string.replace("\n", "")
    BAD_CHARS = ['\\', '/', ':', '*', '?', '"', '<',
                 '>', '|', '#', '.', '@', '\'', '!']
string = "".join([i if i not in BAD_CHARS else "_" for i in string])
return string
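An illustrative run of nameCorrector; the input title is made up.

```
print(nameCorrector('A "title": with/bad\\chars?'))
# -> A__title___with_bad_chars_
```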