diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index 6fd914b..77c0088 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -4,338 +4,158 @@
 This program downloads imgur, gfycat and direct image and video links of
 saved posts from a reddit account. It is written in Python 3.
 """
+
+import argparse
 import logging
-import os
 import sys
-import time
-from io import StringIO
-from pathlib import Path
-
-from prawcore.exceptions import InsufficientScope
-
-from bulkredditdownloader.arguments import Arguments
-from bulkredditdownloader.config import Config
-from bulkredditdownloader.site_downloaders.direct import Direct
-from bulkredditdownloader.site_downloaders.erome import Erome
-from bulkredditdownloader.site_downloaders.gallery import Gallery
-from bulkredditdownloader.site_downloaders.gfycat import Gfycat
-from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
-from bulkredditdownloader.site_downloaders.imgur import Imgur
-from bulkredditdownloader.site_downloaders.redgifs import Redgifs
-from bulkredditdownloader.site_downloaders.self_post import SelfPost
-from bulkredditdownloader.site_downloaders.vreddit import VReddit
-from bulkredditdownloader.site_downloaders.youtube import Youtube
-from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError,
-                                         ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError,
-                                         TypeInSkip, full_exc_info)
-from bulkredditdownloader.json_helper import JsonFile
-from bulkredditdownloader.program_mode import ProgramMode
-from bulkredditdownloader.reddit import Reddit
-from bulkredditdownloader.searcher import getPosts
-from bulkredditdownloader.store import Store
-from bulkredditdownloader.utils import GLOBAL, createLogFile, nameCorrector, printToFile
+from bulkredditdownloader.downloader import RedditDownloader
+from bulkredditdownloader.errors import BulkDownloaderException
 
-from time import sleep
-
-__author__ = "Ali Parlakci"
-__license__ = "GPL"
-__version__ = "1.10.0"
-__maintainer__ = "Ali Parlakci"
-__email__ = "parlakciali@gmail.com"
+logger = logging.getLogger()
+parser = argparse.ArgumentParser(allow_abbrev=False,
+                                 description="This program downloads media from reddit posts")
 
 
-def postFromLog(filename):
-    """Analyze a log file and return a list of dictionaries containing
-    submissions
-    """
-    if Path.is_file(Path(filename)):
-        content = JsonFile(filename).read()
+def _add_options():
+    parser.add_argument("directory",
+                        help="Specifies the directory where posts will be downloaded to",
+                        metavar="DIRECTORY")
+    parser.add_argument("--verbose", "-v",
+                        help="Verbose mode",
+                        action="store_true",
+                        default=False)
+    parser.add_argument("--quit", "-q",
+                        help="Auto quit after the process finishes",
+                        action="store_true",
+                        default=False)
+    parser.add_argument("--link", "-l",
+                        help="Get posts from link",
+                        metavar="link")
+    parser.add_argument("--saved",
+                        action="store_true",
+                        required="--unsave" in sys.argv,
+                        help="Triggers saved mode")
+    parser.add_argument("--unsave",
+                        action="store_true",
+                        help="Unsaves downloaded posts")
+    parser.add_argument("--submitted",
+                        action="store_true",
+                        help="Gets posts of --user")
+    parser.add_argument("--upvoted",
+                        action="store_true",
+                        help="Gets upvoted posts of --user")
+    parser.add_argument("--log",
+                        help="Takes a log file that this program created (a JSON file), reads the posts "
+                             "in it and tries downloading them again.",
metavar="LOG FILE") + parser.add_argument("--subreddit", + nargs="+", + help="Triggers subreddit mode and takes subreddit's name without r/. use \"frontpage\" " + "for frontpage", + metavar="SUBREDDIT", + type=str) + parser.add_argument("--multireddit", + help="Triggers multireddit mode and takes multireddit's name without m", + metavar="MULTIREDDIT", + type=str) + parser.add_argument("--user", + help="reddit username if needed. use \"me\" for current user", + required="--multireddit" in sys.argv or "--submitted" in sys.argv, + metavar="redditor", + type=str) + parser.add_argument("--search", + help="Searches for given query in given subreddits", + metavar="query", + type=str) + parser.add_argument("--sort", + help="Either hot, top, new, controversial, rising or relevance default: hot", + choices=["hot", "top", "new", "controversial", "rising", "relevance"], + metavar="SORT TYPE", + type=str) + parser.add_argument("--limit", + help="default: unlimited", + metavar="Limit", + type=int) + parser.add_argument("--time", + help="Either hour, day, week, month, year or all. default: all", + choices=["all", "hour", "day", "week", "month", "year"], + metavar="TIME_LIMIT", + type=str) + parser.add_argument("--skip", + nargs="+", + help="Skip posts with given type", + type=str, + choices=["images", "videos", "gifs", "self"], + default=[]) + parser.add_argument("--skip-domain", + nargs="+", + help="Skip posts with given domain", + type=str, + default=[]) + parser.add_argument("--set-folderpath", + action="store_true", + help="Set custom folderpath", + default='{SUBREDDIT}' + ) + parser.add_argument("--set-filename", + action="store_true", + help="Set custom filename", + default='{REDDITOR}_{TITLE}_{POSTID}' + ) + parser.add_argument("--set-default-directory", + action="store_true", + help="Set a default directory to be used in case no directory is given", + ) + parser.add_argument("--set-default-options", + action="store_true", + help="Set default options to use everytime program runs", + ) + parser.add_argument("--use-local-config", + action="store_true", + help="Creates a config file in the program's directory" + " and uses it. 
Useful for having multiple configs", + ) + parser.add_argument("--no-dupes", + action="store_true", + help="Do not download duplicate posts on different subreddits", + ) + parser.add_argument("--downloaded-posts", + help="Use a hash file to keep track of downloaded files", + type=str + ) + parser.add_argument("--no-download", + action="store_true", + help="Just saved posts into a the POSTS.json file without downloading" + ) + + +def _setup_logging(verbosity: int): + logger.setLevel(1) + stream = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') + stream.setFormatter(formatter) + logger.addHandler(stream) + if verbosity < 0: + stream.setLevel(logging.INFO) else: - print("File not found") - sys.exit() + stream.setLevel(logging.DEBUG) + logging.getLogger('praw').setLevel(logging.CRITICAL) + logging.getLogger('prawcore').setLevel(logging.CRITICAL) + logging.getLogger('urllib3').setLevel(logging.CRITICAL) + + +def main(args: argparse.Namespace): + _setup_logging(args.verbose) try: - del content["HEADER"] - except KeyError: - pass + reddit_downloader = RedditDownloader(args) + reddit_downloader.download() + except BulkDownloaderException as e: + logger.critical(f'An error occured {e}') - posts = [] - for post in content: - if not content[post][-1]['TYPE'] is None: - posts.append(content[post][-1]) - - return posts - - -def isPostExists(post, directory): - """Figure out a file's name and checks if the file already exists""" - - filename = GLOBAL.config['filename'].format(**post) - - possible_extensions = [".jpg", ".png", ".mp4", ".gif", ".webm", ".md", ".mkv", ".flv"] - - for extension in possible_extensions: - - path = directory / Path(filename + extension) - - if path.exists(): - return True - - return False - - - -def downloadPost(submission, directory): - downloaders = { - "imgur": Imgur, "gfycat": Gfycat, "erome": Erome, "direct": Direct, "self": SelfPost, - "redgifs": Redgifs, "gifdeliverynetwork": GifDeliveryNetwork, - "v.redd.it": VReddit, "youtube": Youtube, "gallery": Gallery - } - - print() - if submission['TYPE'] in downloaders: - downloaders[submission['TYPE']](directory, submission) - else: - raise NoSuitablePost - - -def download(submissions): - """Analyze list of submissions and call the right function - to download each one, catch errors, update the log files - """ - - downloaded_count = 0 - duplicates = 0 - - failed_file = createLogFile("FAILED") - - if GLOBAL.arguments.unsave: - reddit = Reddit(GLOBAL.config['credentials']['reddit']).begin() - - subs_length = len(submissions) - - for i in range(len(submissions)): - print(f"\n({i+1}/{subs_length})", end=" — ") - print(submissions[i]['POSTID'], - f"r/{submissions[i]['SUBREDDIT']}", - f"u/{submissions[i]['REDDITOR']}", - submissions[i]['FLAIR'] if submissions[i]['FLAIR'] else "", - sep=" — ", - end="") - print(f" – {submissions[i]['TYPE'].upper()}", end="", no_print=True) - - directory = GLOBAL.directory / \ - GLOBAL.config["folderpath"].format(**submissions[i]) - details = { - **submissions[i], - **{"TITLE": nameCorrector( - submissions[i]['TITLE'], - reference=str(directory) - + GLOBAL.config['filename'].format(**submissions[i]) - + ".ext")} - } - filename = GLOBAL.config['filename'].format(**details) - - if isPostExists(details, directory): - print() - print(directory) - print(filename) - print("It already exists") - duplicates += 1 - continue - - if any(domain in submissions[i]['CONTENTURL'] for domain in GLOBAL.arguments.skip): - print() - 
print(submissions[i]['CONTENTURL']) - print("Domain found in skip domains, skipping post...") - continue - - try: - downloadPost(details, directory) - GLOBAL.downloadedPosts.add(details['POSTID']) - - try: - if GLOBAL.arguments.unsave: - reddit.submission(id=details['POSTID']).unsave() - except InsufficientScope: - reddit = Reddit().begin() - reddit.submission(id=details['POSTID']).unsave() - - downloaded_count += 1 - - except FileAlreadyExistsError: - print("It already exists") - GLOBAL.downloadedPosts.add(details['POSTID']) - duplicates += 1 - - except ImgurLoginError: - print("Imgur login failed. \nQuitting the program as unexpected errors might occur.") - sys.exit() - - except ImgurLimitError as exception: - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)), details - ]}) - - except NotADownloadableLinkError as exception: - print("{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))) - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)), - submissions[i] - ]}) - - except TypeInSkip: - print() - print(submissions[i]['CONTENTURL']) - print("Skipping post...") - - except DomainInSkip: - print() - print(submissions[i]['CONTENTURL']) - print("Skipping post...") - - except NoSuitablePost: - print("No match found, skipping...") - - except FailedToDownload: - print("Failed to download the posts, skipping...") - except AlbumNotDownloadedCompletely: - print("Album did not downloaded completely.") - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exc.__class__.__name__, info=str(exc)), - submissions[i] - ]}) - - except Exception as exc: - print("{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exc.__class__.__name__, info=str(exc)) - ) - - logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue(), no_print=True) - - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exc.__class__.__name__, info=str(exc)), - submissions[i] - ]}) - - if duplicates: - print(f"\nThere {'were' if duplicates > 1 else 'was'} {duplicates} duplicate{'s' if duplicates > 1 else ''}") - - if downloaded_count == 0: - print("Nothing is downloaded :(") - - else: - print(f"Total of {downloaded_count} link{'s' if downloaded_count > 1 else ''} downloaded!") - - -def printLogo(): - VanillaPrint(f"\nBulk Downloader for Reddit v{__version__}\n" - f"Written by Ali PARLAKCI – parlakciali@gmail.com\n\n" - f"https://github.com/aliparlakci/bulk-downloader-for-reddit/\n" - ) - - -def main(): - if Path("config.json").exists(): - GLOBAL.configDirectory = Path("config.json") - else: - if not Path(GLOBAL.defaultConfigDirectory).is_dir(): - os.makedirs(GLOBAL.defaultConfigDirectory) - GLOBAL.configDirectory = GLOBAL.defaultConfigDirectory / "config.json" - try: - GLOBAL.config = Config(GLOBAL.configDirectory).generate() - except InvalidJSONFile as exception: - VanillaPrint(str(exception.__class__.__name__), ">>", str(exception)) - VanillaPrint("Resolve it or remove it to proceed") - sys.exit() - - sys.argv = sys.argv + GLOBAL.config["options"].split() - - arguments = Arguments.parse() - GLOBAL.arguments = arguments - - if arguments.set_filename: - Config(GLOBAL.configDirectory).setCustomFileName() - sys.exit() - - if arguments.set_folderpath: - Config(GLOBAL.configDirectory).setCustomFolderPath() - sys.exit() - - if 
arguments.set_default_directory: - Config(GLOBAL.configDirectory).setDefaultDirectory() - sys.exit() - - if arguments.set_default_options: - Config(GLOBAL.configDirectory).setDefaultOptions() - sys.exit() - - if arguments.use_local_config: - JsonFile("config.json").add(GLOBAL.config) - sys.exit() - - if arguments.directory: - GLOBAL.directory = Path(arguments.directory.strip()) - elif "default_directory" in GLOBAL.config and GLOBAL.config["default_directory"] != "": - GLOBAL.directory = Path( - GLOBAL.config["default_directory"].format(time=GLOBAL.RUN_TIME)) - else: - GLOBAL.directory = Path(input("\ndownload directory: ").strip()) - - if arguments.downloaded_posts: - GLOBAL.downloadedPosts = Store(arguments.downloaded_posts) - else: - GLOBAL.downloadedPosts = Store() - - printLogo() - print("\n", " ".join(sys.argv), "\n", no_print=True) - - if arguments.log is not None: - log_dir = Path(arguments.log) - download(postFromLog(log_dir)) - sys.exit() - - program_mode = ProgramMode(arguments).generate() - - try: - posts = getPosts(program_mode) - except Exception as exc: - logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue(), no_print=True) - print(exc) - sys.exit() - - if posts is None: - print("I could not find any posts in that URL") - sys.exit() - - if GLOBAL.arguments.no_download: - pass - else: - download(posts) - - -if __name__ == "__main__": - - GLOBAL.log_stream = StringIO() - logging.basicConfig(stream=GLOBAL.log_stream, level=logging.INFO) - - try: - VanillaPrint = print - print = printToFile - GLOBAL.RUN_TIME = str(time.strftime("%d-%m-%Y_%H-%M-%S", time.localtime(time.time()))) - main() - - except KeyboardInterrupt: - if GLOBAL.directory is None: - GLOBAL.directory = Path("../..\\") - - except Exception as exception: - if GLOBAL.directory is None: - GLOBAL.directory = Path("../..\\") - logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue()) - - if not GLOBAL.arguments.quit: - input("\nPress enter to quit\n") +if __name__ == '__main__': + _add_options() + args = parser.parse_args() + main(args) diff --git a/bulkredditdownloader/arguments.py b/bulkredditdownloader/arguments.py deleted file mode 100644 index cbf72c7..0000000 --- a/bulkredditdownloader/arguments.py +++ /dev/null @@ -1,153 +0,0 @@ -import argparse -import sys - - -class Arguments: - @staticmethod - def parse(arguments=None): - """Initialize argparse and add arguments""" - if arguments is None: - arguments = [] - - parser = argparse.ArgumentParser(allow_abbrev=False, - description="This program downloads media from reddit posts") - parser.add_argument("--directory", "-d", - help="Specifies the directory where posts will be downloaded to", - metavar="DIRECTORY") - - parser.add_argument("--verbose", "-v", - help="Verbose Mode", - action="store_true", - default=False) - - parser.add_argument("--quit", "-q", - help="Auto quit afer the process finishes", - action="store_true", - default=False) - - parser.add_argument("--link", "-l", - help="Get posts from link", - metavar="link") - - parser.add_argument("--saved", - action="store_true", - required="--unsave" in sys.argv, - help="Triggers saved mode") - - parser.add_argument("--unsave", - action="store_true", - help="Unsaves downloaded posts") - - parser.add_argument("--submitted", - action="store_true", - help="Gets posts of --user") - - parser.add_argument("--upvoted", - action="store_true", - help="Gets upvoted posts of --user") - - 
parser.add_argument("--log", - help="Takes a log file which created by itself (json files),reads posts and tries " - "downloading them again.", - # type=argparse.FileType('r'), - metavar="LOG FILE") - - parser.add_argument("--subreddit", - nargs="+", - help="Triggers subreddit mode and takes subreddit's name without r/. use \"frontpage\" " - "for frontpage", - metavar="SUBREDDIT", - type=str) - - parser.add_argument("--multireddit", - help="Triggers multireddit mode and takes multireddit's name without m", - metavar="MULTIREDDIT", - type=str) - - parser.add_argument("--user", - help="reddit username if needed. use \"me\" for current user", - required="--multireddit" in sys.argv or "--submitted" in sys.argv, - metavar="redditor", - type=str) - - parser.add_argument( - "--search", - help="Searches for given query in given subreddits", - metavar="query", - type=str) - - parser.add_argument("--sort", - help="Either hot, top, new, controversial, rising or relevance default: hot", - choices=["hot", "top", "new", "controversial", "rising", "relevance"], - metavar="SORT TYPE", - type=str) - - parser.add_argument("--limit", - help="default: unlimited", - metavar="Limit", - type=int) - - parser.add_argument("--time", - help="Either hour, day, week, month, year or all. default: all", - choices=["all", "hour", "day", "week", "month", "year"], - metavar="TIME_LIMIT", - type=str) - - parser.add_argument("--skip", - nargs="+", - help="Skip posts with given type", - type=str, - choices=["images", "videos", "gifs", "self"], - default=[]) - - parser.add_argument("--skip-domain", - nargs="+", - help="Skip posts with given domain", - type=str, - default=[]) - - parser.add_argument("--set-folderpath", - action="store_true", - help="Set custom folderpath" - ) - - parser.add_argument("--set-filename", - action="store_true", - help="Set custom filename", - ) - - parser.add_argument("--set-default-directory", - action="store_true", - help="Set a default directory to be used in case no directory is given", - ) - - parser.add_argument("--set-default-options", - action="store_true", - help="Set default options to use everytime program runs", - ) - - parser.add_argument("--use-local-config", - action="store_true", - help="Creates a config file in the program's directory" - " and uses it. 
Useful for having multiple configs", - ) - - parser.add_argument("--no-dupes", - action="store_true", - help="Do not download duplicate posts on different subreddits", - ) - - parser.add_argument("--downloaded-posts", - help="Use a hash file to keep track of downloaded files", - type=str - ) - - parser.add_argument("--no-download", - action="store_true", - help="Just saved posts into a the POSTS.json file without downloading" - ) - - if not arguments: - return parser.parse_args() - else: - return parser.parse_args(arguments) diff --git a/bulkredditdownloader/config.py b/bulkredditdownloader/config.py deleted file mode 100644 index 36dec10..0000000 --- a/bulkredditdownloader/config.py +++ /dev/null @@ -1,109 +0,0 @@ -from bulkredditdownloader.reddit import Reddit -from bulkredditdownloader.json_helper import JsonFile -from bulkredditdownloader.utils import nameCorrector - - -class Config: - - def __init__(self, filename: str): - self.filename = filename - self.file = JsonFile(self.filename) - - def generate(self) -> dict: - self._validateCredentials() - self._readCustomFileName() - self._readCustomFolderPath() - self._readDefaultOptions() - return self.file.read() - - def setCustomFileName(self): - print(""" -IMPORTANT: Do not change the filename structure frequently. - If you did, the program could not find duplicates and - would download the already downloaded files again. - This would not create any duplicates in the directory but - the program would not be as snappy as it should be. - -Type a template file name for each post. - -You can use SUBREDDIT, REDDITOR, POSTID, TITLE, UPVOTES, FLAIR, DATE in curly braces -The text in curly braces will be replaced with the corresponding property of an each post - -For example: {FLAIR}_{SUBREDDIT}_{REDDITOR} - -Existing filename template:""", None if "filename" not in self.file.read() else self.file.read()["filename"]) - - filename = nameCorrector(input(">> ").upper()) - self.file.add({"filename": filename}) - - def _readCustomFileName(self): - content = self.file.read() - - if "filename" not in content: - self.file.add({"filename": "{REDDITOR}_{TITLE}_{POSTID}"}) - content = self.file.read() - - if "{POSTID}" not in content["filename"]: - self.file.add({"filename": content["filename"] + "_{POSTID}"}) - - def setCustomFolderPath(self): - print(""" -Type a folder structure (generic folder path) - -Use slash or DOUBLE backslash to separate folders - -You can use SUBREDDIT, REDDITOR, POSTID, TITLE, UPVOTES, FLAIR, DATE in curly braces -The text in curly braces will be replaced with the corresponding property of an each post - -For example: {REDDITOR}/{SUBREDDIT}/{FLAIR} - -Existing folder structure""", None if "folderpath" not in self.file.read() else self.file.read()["folderpath"]) - - folderpath = nameCorrector(input(">> ").strip("\\").strip("/").upper()) - - self.file.add({"folderpath": folderpath}) - - def _readCustomFolderPath(self, path=None): - content = self.file.read() - if "folderpath" not in content: - self.file.add({"folderpath": "{SUBREDDIT}"}) - - def setDefaultOptions(self): - print(""" -Type options to be used everytime script runs - -For example: --no-dupes --quit --limit 100 --skip youtube.com - -Existing default options:""", None if "options" not in self.file.read() else self.file.read()["options"]) - - options = input(">> ").strip("") - - self.file.add({"options": options}) - - def _readDefaultOptions(self): - content = self.file.read() - if "options" not in content: - self.file.add({"options": ""}) - - def 
_validateCredentials(self):
-        """Read credentials from config.json file"""
-        try:
-            content = self.file.read()["credentials"]
-        except BaseException:
-            self.file.add({"credentials": {}})
-            content = self.file.read()["credentials"]
-
-        if "reddit" in content and len(content["reddit"]) != 0:
-            pass
-        else:
-            Reddit().begin()
-            print()
-
-    def setDefaultDirectory(self):
-        print("""Set a default directory to use in case no directory is given
-Leave blank to reset it. You can use {time} in foler names to use to timestamp it
-For example: D:/archive/BDFR_{time}
-""")
-        print("Current default directory:", self.file.read()[
-              "default_directory"] if "default_directory" in self.file.read() else "")
-        self.file.add({"default_directory": input(">> ")})
diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
new file mode 100644
index 0000000..fc10ee0
--- /dev/null
+++ b/bulkredditdownloader/downloader.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import argparse
+import configparser
+import logging
+import socket
+from datetime import datetime
+from enum import Enum, auto
+from pathlib import Path
+
+import appdirs
+import praw
+import praw.models
+
+from bulkredditdownloader.download_filter import DownloadFilter
+from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError
+from bulkredditdownloader.file_name_formatter import FileNameFormatter
+from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory
+
+logger = logging.getLogger(__name__)
+
+
+class RedditTypes:
+    class SortType(Enum):
+        HOT = auto()
+        RISING = auto()
+        CONTROVERSIAL = auto()
+        NEW = auto()
+        RELEVANCE = auto()
+
+    class TimeType(Enum):
+        HOUR = auto()
+        DAY = auto()
+        WEEK = auto()
+        MONTH = auto()
+        YEAR = auto()
+        ALL = auto()
+
+
+class RedditDownloader:
+    def __init__(self, args: argparse.Namespace):
+        self.config_directories = appdirs.AppDirs('bulk_reddit_downloader')
+        self.run_time = datetime.now().isoformat()
+        self._setup_internal_objects(args)
+
+        self.reddit_lists = self._retrieve_reddit_lists(args)
+
+    def _setup_internal_objects(self, args: argparse.Namespace):
+        self.download_filter = RedditDownloader._create_download_filter(args)
+        self.time_filter = RedditDownloader._create_time_filter(args)
+        self.sort_filter = RedditDownloader._create_sort_filter(args)
+        self.file_name_formatter = RedditDownloader._create_file_name_formatter(args)
+        self._determine_directories(args)
+        self.master_hash_list = []
+        self._load_config(args)
+        if self.cfg_parser.has_option('DEFAULT', 'username') and self.cfg_parser.has_option('DEFAULT', 'password'):
+            self.authenticated = True
+
+            self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
+                                               client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
+                                               user_agent=socket.gethostname(),
+                                               username=self.cfg_parser.get('DEFAULT', 'username'),
+                                               password=self.cfg_parser.get('DEFAULT', 'password'))
+        else:
+            self.authenticated = False
+            self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
+                                               client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
+                                               user_agent=socket.gethostname())
+
+    def _retrieve_reddit_lists(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        master_list = []
+        master_list.extend(self._get_subreddits(args))
+        master_list.extend(self._get_multireddits(args))
+        master_list.extend(self._get_user_data(args))
+        return master_list
+
+    def _determine_directories(self, args: argparse.Namespace):
+        self.download_directory = Path(args.directory)
+        self.logfile_directory = self.download_directory / 'LOG_FILES'
+        self.config_directory = self.config_directories.user_config_dir
+
+    def _load_config(self, args: argparse.Namespace):
+        self.cfg_parser = configparser.ConfigParser()
+        if args.use_local_config and Path('./config.cfg').exists():
+            self.cfg_parser.read(Path('./config.cfg'))
+        else:
+            self.cfg_parser.read(Path('./default_config.cfg').resolve())
+
+    def _get_subreddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        if args.subreddit:
+            subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in args.subreddit]
+            if self.sort_filter is RedditTypes.SortType.NEW:
+                sort_function = praw.models.Subreddit.new
+            elif self.sort_filter is RedditTypes.SortType.RISING:
+                sort_function = praw.models.Subreddit.rising
+            elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL:
+                sort_function = praw.models.Subreddit.controversial
+            else:
+                sort_function = praw.models.Subreddit.hot
+            return [sort_function(reddit) for reddit in subreddits]
+        else:
+            return []
+
+    def _get_multireddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        if args.multireddit:
+            if self.authenticated:
+                # --multireddit takes a single name; --user is required alongside it
+                return [self.reddit_instance.multireddit(args.user, args.multireddit)]
+            else:
+                raise RedditAuthenticationError('Accessing multireddits requires authentication')
+        else:
+            return []
+
+    def _get_user_data(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        if any((args.upvoted, args.submitted, args.saved)):
+            if self.authenticated:
+                generators = []
+                if args.upvoted:
+                    generators.append(self.reddit_instance.redditor(args.user).upvoted)
+                if args.submitted:
+                    generators.append(self.reddit_instance.redditor(args.user).submissions)
+                if args.saved:
+                    generators.append(self.reddit_instance.redditor(args.user).saved)
+
+                return generators
+            else:
+                raise RedditAuthenticationError('Accessing user lists requires authentication')
+        else:
+            return []
+
+    @staticmethod
+    def _create_file_name_formatter(args: argparse.Namespace) -> FileNameFormatter:
+        return FileNameFormatter(args.set_filename, args.set_folderpath)
+
+    @staticmethod
+    def _create_time_filter(args: argparse.Namespace) -> RedditTypes.TimeType:
+        try:
+            return RedditTypes.TimeType[args.time.upper()]
+        except (KeyError, AttributeError):
+            return RedditTypes.TimeType.ALL
+
+    @staticmethod
+    def _create_sort_filter(args: argparse.Namespace) -> RedditTypes.SortType:
+        try:
+            return RedditTypes.SortType[args.sort.upper()]
+        except (KeyError, AttributeError):
+            return RedditTypes.SortType.HOT
+
+    @staticmethod
+    def _create_download_filter(args: argparse.Namespace) -> DownloadFilter:
+        formats = {
+            "videos": [".mp4", ".webm"],
+            "images": [".jpg", ".jpeg", ".png", ".bmp"],
+            "gifs": [".gif"],
+            "self": []
+        }
+        excluded_extensions = [extension for ext_type in args.skip for extension in formats.get(ext_type, ())]
+        return DownloadFilter(excluded_extensions, args.skip_domain)
+
+    def download(self):
+        for generator in self.reddit_lists:
+            for submission in generator:
+                self._download_submission(submission)
+
+    def _download_submission(self, submission: praw.models.Submission):
+        # TODO: check existence here
+        if self.download_filter.check_url(submission.url):
+            try:
+                downloader_class = DownloadFactory.pull_lever(submission.url)
+                downloader = downloader_class(self.download_directory, submission)
+                content = downloader.download()
+                for res in content:
+                    destination = self.file_name_formatter.format_path(res, self.download_directory)
+                    if res.hash.hexdigest() not in self.master_hash_list:
+                        destination.parent.mkdir(parents=True, exist_ok=True)
+                        with open(destination, 'wb') as file:
+                            file.write(res.content)
+                        logger.debug('Wrote file to {}'.format(destination))
+                        self.master_hash_list.append(res.hash.hexdigest())
+                        logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest()))
+
+                logger.info('Downloaded submission {}'.format(submission.name))
+            except NotADownloadableLinkError as e:
+                logger.error('Could not download submission {}: {}'.format(submission.name, e))
diff --git a/bulkredditdownloader/errors.py b/bulkredditdownloader/errors.py
index 7bf47b9..d7c5041 100644
--- a/bulkredditdownloader/errors.py
+++ b/bulkredditdownloader/errors.py
@@ -1,137 +1,28 @@
-import sys
+#!/usr/bin/env python3
 
-
-def full_exc_info(exc_info):
-
-    def current_stack(skip=0):
-        try:
-            1 / 0
-        except ZeroDivisionError:
-            f = sys.exc_info()[2].tb_frame
-        for i in range(skip + 2):
-            f = f.f_back
-        lst = []
-        while f is not None:
-            lst.append((f, f.f_lineno))
-            f = f.f_back
-        return lst
-
-    def extend_traceback(tb, stack):
-
-        class FauxTb():
-            def __init__(self, tb_frame, tb_lineno, tb_next):
-                self.tb_frame = tb_frame
-                self.tb_lineno = tb_lineno
-                self.tb_next = tb_next
-
-        """Extend traceback with stack info."""
-        head = tb
-        for tb_frame, tb_lineno in stack:
-            head = FauxTb(tb_frame, tb_lineno, head)
-        return head
-
-    """Like sys.exc_info, but includes the full traceback."""
-    t, v, tb = exc_info
-    full_tb = extend_traceback(tb, current_stack(1))
-    return t, v, full_tb
-
-
-class RedditLoginFailed(Exception):
+class BulkDownloaderException(Exception):
     pass
 
 
-class ImgurLoginError(Exception):
+class NotADownloadableLinkError(BulkDownloaderException):
     pass
 
 
-class FileAlreadyExistsError(Exception):
+class RedditAuthenticationError(BulkDownloaderException):
     pass
 
 
-class NotADownloadableLinkError(Exception):
+class InvalidJSONFile(BulkDownloaderException):
     pass
 
 
-class AlbumNotDownloadedCompletely(Exception):
+class FailedToDownload(BulkDownloaderException):
     pass
 
 
-class FileNameTooLong(Exception):
+class ImageNotFound(BulkDownloaderException):
     pass
 
 
-class InvalidRedditLink(Exception):
-    pass
-
-
-class ProgramModeError(Exception):
-    pass
-
-
-class SearchModeError(Exception):
-    pass
-
-
-class RedditorNameError(Exception):
-    pass
-
-
-class NoMatchingSubmissionFound(Exception):
-    pass
-
-
-class NoPrawSupport(Exception):
-    pass
-
-
-class NoRedditSupport(Exception):
-    pass
-
-
-class MultiredditNotFound(Exception):
-    pass
-
-
-class InsufficientPermission(Exception):
-    pass
-
-
-class InvalidSortingType(Exception):
-    pass
-
-
-
-class NoSuitablePost(Exception):
-    pass
-
-
-class ImgurLimitError(Exception):
-    pass
-
-
-class DirectLinkNotFound(Exception):
-    pass
-
-
-class InvalidJSONFile(Exception):
-    pass
-
-
-class FailedToDownload(Exception):
-    pass
-
-
-class TypeInSkip(Exception):
-    pass
-
-
-class DomainInSkip(Exception):
-    pass
-
-
-class ImageNotFound(Exception):
-    pass
-
-
-class ExtensionError(Exception):
+class ExtensionError(BulkDownloaderException):
     pass
diff --git a/bulkredditdownloader/parser.py b/bulkredditdownloader/parser.py
deleted file mode 100644
index e8a38f7..0000000
--- a/bulkredditdownloader/parser.py
+++ /dev/null
@@ -1,234 +0,0 @@
-from pprint import pprint
-
-try:
-    from bulkredditdownloader.errors import InvalidRedditLink
-except ModuleNotFoundError:
-    from errors import 
InvalidRedditLink - - -def QueryParser(passed_queries: str) -> dict: - extracted_queries = {} - - question_mark_index = passed_queries.index("?") - header = passed_queries[:question_mark_index] - extracted_queries["HEADER"] = header - queries = passed_queries[question_mark_index + 1:] - - parsed_queries = queries.split("&") - - for query in parsed_queries: - query = query.split("=") - extracted_queries[query[0]] = query[1] - - if extracted_queries["HEADER"] == "search": - extracted_queries["q"] = extracted_queries["q"].replace("%20", " ") - - return extracted_queries - - -def LinkParser(link: str) -> dict: - result = {} - short_link = False - - if "reddit.com" not in link: - raise InvalidRedditLink("Invalid reddit link") - - splitted_link = link.split("/") - - if splitted_link[0] == "https:" or splitted_link[0] == "http:": - splitted_link = splitted_link[2:] - - try: - if (splitted_link[-2].endswith("reddit.com") and - splitted_link[-1] == "") or splitted_link[-1].endswith("reddit.com"): - - result["sort"] = "best" - return result - except IndexError: - if splitted_link[0].endswith("reddit.com"): - result["sort"] = "best" - return result - - if "redd.it" in splitted_link: - short_link = True - - if splitted_link[0].endswith("reddit.com"): - splitted_link = splitted_link[1:] - - if "comments" in splitted_link: - result = {"post": link} - return result - - elif "me" in splitted_link or \ - "u" in splitted_link or \ - "user" in splitted_link or \ - "r" in splitted_link or \ - "m" in splitted_link: - - if "r" in splitted_link: - result["subreddit"] = splitted_link[splitted_link.index("r") + 1] - - elif "m" in splitted_link: - result["multireddit"] = splitted_link[splitted_link.index("m") + 1] - result["user"] = splitted_link[splitted_link.index("m") - 1] - - else: - for index in range(len(splitted_link)): - if splitted_link[index] == "u" or splitted_link[index] == "user": - result["user"] = splitted_link[index + 1] - - elif splitted_link[index] == "me": - result["user"] = "me" - - for index in range(len(splitted_link)): - if splitted_link[index] in ["hot", "top", "new", "controversial", "rising"]: - - result["sort"] = splitted_link[index] - - if index == 0: - result["subreddit"] = "frontpage" - - elif splitted_link[index] in ["submitted", "saved", "posts", "upvoted"]: - if splitted_link[index] == "submitted" or splitted_link[index] == "posts": - result["submitted"] = {} - - elif splitted_link[index] == "saved": - result["saved"] = True - - elif splitted_link[index] == "upvoted": - result["upvoted"] = True - - elif "?" 
in splitted_link[index]: - parsed_query = QueryParser(splitted_link[index]) - if parsed_query["HEADER"] == "search": - del parsed_query["HEADER"] - result["search"] = parsed_query - - elif parsed_query["HEADER"] == "submitted" or \ - parsed_query["HEADER"] == "posts": - del parsed_query["HEADER"] - result["submitted"] = parsed_query - - else: - del parsed_query["HEADER"] - result["queries"] = parsed_query - - if not ("upvoted" in result or - "saved" in result or - "submitted" in result or - "multireddit" in result) and "user" in result: - result["submitted"] = {} - - return result - - -def LinkDesigner(link) -> dict: - attributes = LinkParser(link) - mode = {} - - if "post" in attributes: - mode["post"] = attributes["post"] - mode["sort"] = "" - mode["time"] = "" - return mode - - elif "search" in attributes: - mode["search"] = attributes["search"]["q"] - - if "restrict_sr" in attributes["search"]: - - if not (attributes["search"]["restrict_sr"] == 0 or - attributes["search"]["restrict_sr"] == "off" or - attributes["search"]["restrict_sr"] == ""): - - if "subreddit" in attributes: - mode["subreddit"] = attributes["subreddit"] - elif "multireddit" in attributes: - mode["multreddit"] = attributes["multireddit"] - mode["user"] = attributes["user"] - else: - mode["subreddit"] = "all" - else: - mode["subreddit"] = "all" - - if "t" in attributes["search"]: - mode["time"] = attributes["search"]["t"] - else: - mode["time"] = "all" - - if "sort" in attributes["search"]: - mode["sort"] = attributes["search"]["sort"] - else: - mode["sort"] = "relevance" - - if "include_over_18" in attributes["search"]: - if attributes["search"]["include_over_18"] == 1 or attributes["search"]["include_over_18"] == "on": - mode["nsfw"] = True - else: - mode["nsfw"] = False - - else: - if "queries" in attributes: - if not ("submitted" in attributes or "posts" in attributes): - - if "t" in attributes["queries"]: - mode["time"] = attributes["queries"]["t"] - else: - mode["time"] = "day" - else: - if "t" in attributes["queries"]: - mode["time"] = attributes["queries"]["t"] - else: - mode["time"] = "all" - - if "sort" in attributes["queries"]: - mode["sort"] = attributes["queries"]["sort"] - else: - mode["sort"] = "new" - else: - mode["time"] = "day" - - if "subreddit" in attributes and "search" not in attributes: - mode["subreddit"] = attributes["subreddit"] - - elif "user" in attributes and "search" not in attributes: - mode["user"] = attributes["user"] - - if "submitted" in attributes: - mode["submitted"] = True - if "sort" in attributes["submitted"]: - mode["sort"] = attributes["submitted"]["sort"] - elif "sort" in mode: - pass - else: - mode["sort"] = "new" - - if "t" in attributes["submitted"]: - mode["time"] = attributes["submitted"]["t"] - else: - mode["time"] = "all" - - elif "saved" in attributes: - mode["saved"] = True - - elif "upvoted" in attributes: - mode["upvoted"] = True - - elif "multireddit" in attributes: - mode["multireddit"] = attributes["multireddit"] - - if "sort" in attributes: - mode["sort"] = attributes["sort"] - elif "sort" in mode: - pass - else: - mode["sort"] = "hot" - - return mode - - - -if __name__ == "__main__": - while True: - link = input("> ") - pprint(LinkDesigner(link)) diff --git a/bulkredditdownloader/program_mode.py b/bulkredditdownloader/program_mode.py deleted file mode 100644 index f2361ac..0000000 --- a/bulkredditdownloader/program_mode.py +++ /dev/null @@ -1,241 +0,0 @@ -import sys -from pathlib import Path - -from bulkredditdownloader.errors import InvalidSortingType, 
ProgramModeError, RedditorNameError, SearchModeError -from bulkredditdownloader.parser import LinkDesigner -import argparse - - - -class ProgramMode: - - def __init__(self, arguments: argparse.Namespace): - self.arguments = arguments - - def generate(self) -> dict: - try: - self._validateProgramMode() - except ProgramModeError: - self._promptUser() - - program_mode = {} - - if self.arguments.user is not None: - program_mode["user"] = self.arguments.user - - if self.arguments.search is not None: - program_mode["search"] = self.arguments.search - if self.arguments.sort == "hot" or \ - self.arguments.sort == "controversial" or \ - self.arguments.sort == "rising": - self.arguments.sort = "relevance" - - if self.arguments.sort is not None: - program_mode["sort"] = self.arguments.sort - else: - if self.arguments.submitted: - program_mode["sort"] = "new" - else: - program_mode["sort"] = "hot" - - if self.arguments.time is not None: - program_mode["time"] = self.arguments.time - else: - program_mode["time"] = "all" - - if self.arguments.link is not None: - self.arguments.link = self.arguments.link.strip("\"") - - program_mode = LinkDesigner(self.arguments.link) - - if self.arguments.search is not None: - program_mode["search"] = self.arguments.search - - if self.arguments.sort is not None: - program_mode["sort"] = self.arguments.sort - - if self.arguments.time is not None: - program_mode["time"] = self.arguments.time - - elif self.arguments.subreddit is not None: - if isinstance(self.arguments.subreddit, list): - self.arguments.subreddit = "+".join(self.arguments.subreddit) - - program_mode["subreddit"] = self.arguments.subreddit - - elif self.arguments.multireddit is not None: - program_mode["multireddit"] = self.arguments.multireddit - - elif self.arguments.saved is True: - program_mode["saved"] = True - - elif self.arguments.upvoted is True: - program_mode["upvoted"] = True - - elif self.arguments.submitted is not None: - program_mode["submitted"] = True - - if self.arguments.sort == "rising": - raise InvalidSortingType("Invalid sorting type has given") - - program_mode["limit"] = self.arguments.limit - - return program_mode - - @staticmethod - def _chooseFrom(choices: list[str]): - print() - choices_by_index = list(str(x) for x in range(len(choices) + 1)) - for i in range(len(choices)): - print("{indent}[{order}] {mode}".format(indent=" " * 4, order=i + 1, mode=choices[i])) - print(" " * 4 + "[0] exit\n") - choice = input("> ") - while not choice.lower() in choices + choices_by_index + ["exit"]: - print("Invalid input\n") - input("> ") - - if choice == "0" or choice == "exit": - sys.exit() - elif choice in choices_by_index: - return choices[int(choice) - 1] - else: - return choice - - def _promptUser(self): - print("select program mode:") - program_modes = ["search", "subreddit", "multireddit", "submitted", "upvoted", "saved", "log"] - program_mode = self._chooseFrom(program_modes) - - if program_mode == "search": - self.arguments.search = input("\nquery: ") - self.arguments.subreddit = input("\nsubreddit: ") - - print("\nselect sort type:") - sort_types = ["relevance", "top", "new"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - - if program_mode == "subreddit": - subreddit_input = input("(type frontpage for all subscribed subreddits,\n" - " use plus to seperate multi subreddits:" 
- " pics+funny+me_irl etc.)\n\n" - "subreddit: ") - self.arguments.subreddit = subreddit_input - - if " " in self.arguments.subreddit: - self.arguments.subreddit = "+".join( - self.arguments.subreddit.split()) - - # DELETE THE PLUS (+) AT THE END - if not subreddit_input.lower() == "frontpage" and self.arguments.subreddit[-1] == "+": - self.arguments.subreddit = self.arguments.subreddit[:-1] - - print("\nselect sort type:") - sort_types = ["hot", "top", "new", "rising", "controversial"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - if sort_type in ["top", "controversial"]: - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - else: - self.arguments.time = "all" - - elif program_mode == "multireddit": - self.arguments.user = input("\nmultireddit owner: ") - self.arguments.multireddit = input("\nmultireddit: ") - - print("\nselect sort type:") - sort_types = ["hot", "top", "new", "rising", "controversial"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - if sort_type in ["top", "controversial"]: - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - else: - self.arguments.time = "all" - - elif program_mode == "submitted": - self.arguments.submitted = True - self.arguments.user = input("\nredditor: ") - - print("\nselect sort type:") - sort_types = ["hot", "top", "new", "controversial"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - if sort_type == "top": - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - else: - self.arguments.time = "all" - - elif program_mode == "upvoted": - self.arguments.upvoted = True - self.arguments.user = input("\nredditor: ") - - elif program_mode == "saved": - self.arguments.saved = True - - elif program_mode == "log": - while True: - self.arguments.log = input("\nlog file directory:") - if Path(self.arguments.log).is_file(): - break - while True: - try: - self.arguments.limit = int(input("\nlimit (0 for none): ")) - if self.arguments.limit == 0: - self.arguments.limit = None - break - except ValueError: - pass - - def _validateProgramMode(self): - """Check if command-line self.arguments are given correcly, - if not, raise errors - """ - if self.arguments.user is None: - user = 0 - else: - user = 1 - - search = 1 if self.arguments.search else 0 - - modes = ["saved", "subreddit", "submitted", "log", "link", "upvoted", "multireddit"] - - values = {x: 0 if getattr(self.arguments, x) is None or - getattr(self.arguments, x) is False - else 1 - for x in modes - } - - if not sum(values[x] for x in values) == 1: - raise ProgramModeError("Invalid program mode") - - if search + values["saved"] == 2: - raise SearchModeError("You cannot search in your saved posts") - - if search + values["submitted"] == 2: - raise SearchModeError("You cannot search in submitted posts") - - if search + values["upvoted"] == 2: - raise SearchModeError("You cannot search in upvoted posts") - - if search + values["log"] == 2: - raise SearchModeError("You cannot search in log files") - - if values["upvoted"] + values["submitted"] == 1 and user == 0: - raise RedditorNameError("No redditor name given") diff --git 
a/bulkredditdownloader/reddit.py b/bulkredditdownloader/reddit.py deleted file mode 100644 index 392b860..0000000 --- a/bulkredditdownloader/reddit.py +++ /dev/null @@ -1,91 +0,0 @@ -import random -import socket -import webbrowser - -import praw -from prawcore.exceptions import ResponseException - -from bulkredditdownloader.errors import RedditLoginFailed -from bulkredditdownloader.json_helper import JsonFile -from bulkredditdownloader.utils import GLOBAL - - - -class Reddit: - - def __init__(self, refresh_token: str = None): - self.SCOPES = ['identity', 'history', 'read', 'save'] - self.PORT = 7634 - self.refresh_token = refresh_token - self.redditInstance = None - self.arguments = { - "client_id": GLOBAL.reddit_client_id, - "client_secret": GLOBAL.reddit_client_secret, - "user_agent": str(socket.gethostname()) - } - - def begin(self) -> praw.Reddit: - if self.refresh_token: - self.arguments["refresh_token"] = self.refresh_token - self.redditInstance = praw.Reddit(**self.arguments) - try: - self.redditInstance.auth.scopes() - return self.redditInstance - except ResponseException: - self.arguments["redirect_uri"] = "http://localhost:" + \ - str(self.PORT) - self.redditInstance = praw.Reddit(**self.arguments) - reddit, refresh_token = self.getRefreshToken(*self.SCOPES) - else: - self.arguments["redirect_uri"] = "http://localhost:" + \ - str(self.PORT) - self.redditInstance = praw.Reddit(**self.arguments) - reddit, refresh_token = self.getRefreshToken(*self.SCOPES) - - JsonFile(GLOBAL.configDirectory).add({"reddit_username": str( - reddit.user.me()), "reddit": refresh_token}, "credentials") - return self.redditInstance - - def recieve_connection(self) -> socket: - """Wait for and then return a connected socket.. - Opens a TCP connection on port 8080, and waits for a single client. - """ - server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(('0.0.0.0', self.PORT)) - server.listen(1) - client = server.accept()[0] - server.close() - return client - - def send_message(self, client: socket, message: str): - """Send message to client and close the connection.""" - client.send('HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')) - client.close() - - def getRefreshToken(self, scopes: list[str]) -> tuple[praw.Reddit, str]: - state = str(random.randint(0, 65000)) - url = self.redditInstance.auth.url(scopes, state, 'permanent') - print("---Setting up the Reddit API---\n") - print("Go to this URL and login to reddit:\n", url, sep="\n", end="\n\n") - webbrowser.open(url, new=2) - - client = self.recieve_connection() - data = client.recv(1024).decode('utf-8') - str(data) - param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&') - params = {key: value for (key, value) in [token.split('=') for token in param_tokens]} - if state != params['state']: - self.send_message(client, 'State mismatch. 
Expected: {} Received: {}'.format(state, params['state'])) - raise RedditLoginFailed - if 'error' in params: - self.send_message(client, params['error']) - raise RedditLoginFailed - - refresh_token = self.redditInstance.auth.authorize(params['code']) - self.send_message(client, - "" - ) - return self.redditInstance, refresh_token diff --git a/bulkredditdownloader/searcher.py b/bulkredditdownloader/searcher.py deleted file mode 100644 index 19bf1d3..0000000 --- a/bulkredditdownloader/searcher.py +++ /dev/null @@ -1,341 +0,0 @@ -import sys -import time -import urllib.request -from urllib.error import HTTPError - -from prawcore.exceptions import Forbidden, NotFound - -from bulkredditdownloader.errors import (InsufficientPermission, InvalidSortingType, MultiredditNotFound, NoMatchingSubmissionFound, - NoPrawSupport) -from bulkredditdownloader.reddit import Reddit -from praw.models.listing.generator import ListingGenerator -from bulkredditdownloader.utils import GLOBAL, createLogFile, printToFile -from praw.models import Submission - -print = printToFile - - -def getPosts(program_mode: dict) -> list[dict]: - """Call PRAW regarding to arguments and pass it to extractDetails. - Return what extractDetails has returned. - """ - reddit = Reddit(GLOBAL.config["credentials"]["reddit"]).begin() - - if program_mode["sort"] == "best": - raise NoPrawSupport("PRAW does not support that") - - if "subreddit" in program_mode: - if "search" in program_mode: - if program_mode["subreddit"] == "frontpage": - program_mode["subreddit"] = "all" - - if "user" in program_mode: - if program_mode["user"] == "me": - program_mode["user"] = str(reddit.user.me()) - - if "search" not in program_mode: - if program_mode["sort"] == "top" or program_mode["sort"] == "controversial": - keyword_params = {"time_filter": program_mode["time"], "limit": program_mode["limit"]} - # OTHER SORT TYPES DON'T TAKE TIME_FILTER - else: - keyword_params = {"limit": program_mode["limit"]} - else: - keyword_params = {"time_filter": program_mode["time"], "limit": program_mode["limit"]} - - if "search" in program_mode: - if program_mode["sort"] in ["hot", "rising", "controversial"]: - raise InvalidSortingType("Invalid sorting type has given") - - if "subreddit" in program_mode: - print( - "search for \"{search}\" in\n" - "subreddit: {subreddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - search=program_mode["search"], - limit=program_mode["limit"], - sort=program_mode["sort"], - subreddit=program_mode["subreddit"], - time=program_mode["time"] - ).upper(), no_print=True - ) - return extractDetails( - reddit.subreddit(program_mode["subreddit"]).search( - program_mode["search"], - limit=program_mode["limit"], - sort=program_mode["sort"], - time_filter=program_mode["time"] - ) - ) - - elif "multireddit" in program_mode: - raise NoPrawSupport("PRAW does not support that") - - elif "user" in program_mode: - raise NoPrawSupport("PRAW does not support that") - - elif "saved" in program_mode: - raise ("Reddit does not support that") - - if program_mode["sort"] == "relevance": - raise InvalidSortingType("Invalid sorting type has given") - - if "saved" in program_mode: - print("saved posts\nuser:{username}\nlimit={limit}\n".format( - username=reddit.user.me(), - limit=program_mode["limit"]).upper(), - no_print=True - ) - return extractDetails(reddit.user.me().saved(limit=program_mode["limit"])) - - if "subreddit" in program_mode: - - if program_mode["subreddit"] == "frontpage": - print( - "subreddit: {subreddit}\nsort: {sort}\n" - "time: 
{time}\nlimit: {limit}\n".format( - limit=program_mode["limit"], - sort=program_mode["sort"], - subreddit=program_mode["subreddit"], - time=program_mode["time"]).upper(), - no_print=True - ) - return extractDetails(getattr(reddit.front, program_mode["sort"])(**keyword_params)) - - else: - print( - "subreddit: {subreddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - limit=program_mode["limit"], - sort=program_mode["sort"], - subreddit=program_mode["subreddit"], - time=program_mode["time"]).upper(), - no_print=True - ) - return extractDetails( - getattr(reddit.subreddit(program_mode["subreddit"]), program_mode["sort"])(**keyword_params) - ) - print( - "subreddit: {subreddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - limit=programMode["limit"], - sort=programMode["sort"], - subreddit=programMode["subreddit"], - time=programMode["time"] - ).upper(), noPrint=True - ) - return extractDetails( - getattr( - reddit.subreddit(programMode["subreddit"]), programMode["sort"] - )(**keyword_params) - ) - - elif "multireddit" in program_mode: - print( - "user: {user}\n" - "multireddit: {multireddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - user=program_mode["user"], - limit=program_mode["limit"], - sort=program_mode["sort"], - multireddit=program_mode["multireddit"], - time=program_mode["time"]).upper(), - no_print=True - ) - try: - return extractDetails( - getattr(reddit.multireddit(program_mode["user"], program_mode["multireddit"]), - program_mode["sort"] - )(**keyword_params) - ) - except NotFound: - raise MultiredditNotFound("Multireddit not found") - - elif "submitted" in program_mode: - print( - "submitted posts of {user}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - limit=program_mode["limit"], - sort=program_mode["sort"], - user=program_mode["user"], - time=program_mode["time"]).upper(), - no_print=True - ) - return extractDetails( - getattr(reddit.redditor(program_mode["user"]).submissions, program_mode["sort"])(**keyword_params) - ) - - elif "upvoted" in program_mode: - print( - "upvoted posts of {user}\nlimit: {limit}\n".format( - user=program_mode["user"], - limit=program_mode["limit"]).upper(), - no_print=True - ) - try: - return extractDetails(reddit.redditor(program_mode["user"]).upvoted(limit=program_mode["limit"])) - except Forbidden: - raise InsufficientPermission( - "You do not have permission to do that") - - elif "post" in program_mode: - print("post: {post}\n".format(post=program_mode["post"]).upper(), no_print=True) - return extractDetails(reddit.submission(url=program_mode["post"]), single_post=True) - - -def extractDetails(posts: (ListingGenerator, Submission), single_post=False) -> list[dict]: - """Check posts and decide if it can be downloaded. - If so, create a dictionary with post details and append them to a list. - Write all of posts to file. 
Return the list - """ - post_list = [] - post_count = 1 - - all_posts = {} - - print("\nGETTING POSTS") - posts_file = createLogFile("POSTS") - - if single_post: - submission = posts - post_count += 1 - try: - details = {'POSTID': submission.id, - 'TITLE': submission.title, - 'REDDITOR': str(submission.author), - 'TYPE': None, - 'CONTENTURL': submission.url, - 'SUBREDDIT': submission.subreddit.display_name, - 'UPVOTES': submission.score, - 'FLAIR': submission.link_flair_text, - 'DATE': str(time.strftime("%Y-%m-%d_%H-%M", time.localtime(submission.created_utc))) - } - except AttributeError: - pass - - if not any( - domain in submission.domain for domain in GLOBAL.arguments.skip_domain): - result = matchWithDownloader(submission) - - if result is not None: - details = {**details, **result} - post_list.append(details) - posts_file.add({post_count: details}) - - else: - try: - for submission in posts: - if post_count % 100 == 0: - sys.stdout.write("• ") - sys.stdout.flush() - - if post_count % 1000 == 0: - sys.stdout.write("\n" + " " * 14) - sys.stdout.flush() - - try: - details = {'POSTID': submission.id, - 'TITLE': submission.title, - 'REDDITOR': str(submission.author), - 'TYPE': None, - 'CONTENTURL': submission.url, - 'SUBREDDIT': submission.subreddit.display_name, - 'UPVOTES': submission.score, - 'FLAIR': submission.link_flair_text, - 'DATE': str(time.strftime("%Y-%m-%d_%H-%M", time.localtime(submission.created_utc))) - } - except AttributeError: - continue - - if details['POSTID'] in GLOBAL.downloadedPosts(): - continue - - if not any( - domain in submission.domain for domain in GLOBAL.arguments.skip_domain): - result = matchWithDownloader(submission) - - if result is not None: - details = {**details, **result} - post_list.append(details) - - all_posts[post_count] = details - post_count += 1 - - except KeyboardInterrupt: - print("\nKeyboardInterrupt", no_print=True) - - posts_file.add(all_posts) - - if not len(post_list) == 0: - print() - return post_list - else: - raise NoMatchingSubmissionFound("No matching submission was found") - - -def matchWithDownloader(submission: Submission) -> dict[str, str]: - direct_link = extractDirectLink(submission.url) - if direct_link: - return {'TYPE': 'direct', 'CONTENTURL': direct_link} - - if 'v.redd.it' in submission.domain: - bitrates = ["DASH_1080", "DASH_720", "DASH_600", "DASH_480", "DASH_360", "DASH_240"] - - for bitrate in bitrates: - video_url = submission.url + "/" + bitrate + ".mp4" - - try: - response_code = urllib.request.urlopen(video_url).getcode() - except urllib.error.HTTPError: - response_code = 0 - - if response_code == 200: - return {'TYPE': 'v.redd.it', 'CONTENTURL': video_url} - - if 'gfycat' in submission.domain: - return {'TYPE': 'gfycat'} - - if 'youtube' in submission.domain and 'watch' in submission.url: - return {'TYPE': 'youtube'} - - if 'youtu.be' in submission.domain: - url = urllib.request.urlopen(submission.url).geturl() - if 'watch' in url: - return {'TYPE': 'youtube'} - - elif 'imgur' in submission.domain: - return {'TYPE': 'imgur'} - - elif 'erome' in submission.domain: - return {'TYPE': 'erome'} - - elif 'redgifs' in submission.domain: - return {'TYPE': 'redgifs'} - - elif 'gifdeliverynetwork' in submission.domain: - return {'TYPE': 'gifdeliverynetwork'} - - if 'reddit.com/gallery' in submission.url: - return {'TYPE': 'gallery'} - - elif submission.is_self and 'self' not in GLOBAL.arguments.skip: - return {'TYPE': 'self', - 'CONTENT': submission.selftext} - - -def extractDirectLink(url: str) -> (bool, str): - 
"""Check if link is a direct image link. - If so, return URL, - if not, return False - """ - image_types = ['jpg', 'jpeg', 'png', 'mp4', 'webm', 'gif'] - if url[-1] == "/": - url = url[:-1] - - if "i.reddituploads.com" in url: - return url - - for extension in image_types: - if extension == url.split(".")[-1]: - return url - else: - return None diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 7873db7..b3fb7e6 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -1,105 +1,46 @@ #!/usr/bin/env python3 # coding=utf-8 -import hashlib import logging -import re from abc import ABC, abstractmethod from pathlib import Path import requests +from praw.models import Submission -from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip -from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.errors import FailedToDownload +from bulkredditdownloader.resource import Resource logger = logging.getLogger(__name__) class BaseDownloader(ABC): - def __init__(self, directory: Path, post: dict): + def __init__(self, directory: Path, post: Submission): self.directory = directory self.post = post + self.hashes = [] @abstractmethod - def download(self): + def download(self) -> list[Resource]: raise NotImplementedError - @staticmethod - def _create_hash(content: bytes) -> str: - hash_md5 = hashlib.md5(content) - return hash_md5.hexdigest() - - @staticmethod - def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False): - formats = { - "videos": [".mp4", ".webm"], - "images": [".jpg", ".jpeg", ".png", ".bmp"], - "gifs": [".gif"], - "self": [] + def _download_resource(self, resource_url: str): + headers = { + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " + "Safari/537.36 OPR/54.0.2952.64", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3", + "Accept-Encoding": "none", + "Accept-Language": "en-US,en;q=0.8", + "Connection": "keep-alive", } - - for file_type in GLOBAL.arguments.skip: - for extension in formats[file_type]: - if extension in filename: - raise TypeInSkip - - if any(domain in image_url for domain in GLOBAL.arguments.skip_domain): - raise DomainInSkip - - headers = [ - ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " - "Safari/537.36 OPR/54.0.2952.64"), - ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"), - ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), - ("Accept-Encoding", "none"), - ("Accept-Language", "en-US,en;q=0.8"), - ("Connection", "keep-alive") - ] - - folder_dir.mkdir(exist_ok=True) - - if "imgur" not in image_url: - addheaders = headers - else: - addheaders = None - - if not silent: - logger.info(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") - # Loop to attempt download 3 times for i in range(3): - file_path = Path(folder_dir) / filename - - if file_path.is_file(): - raise FileAlreadyExistsError - else: - try: - download_content = requests.get(image_url, headers=addheaders).content - except ConnectionResetError: - raise FailedToDownload - - file_hash = 
BaseDownloader._create_hash(download_content) - if GLOBAL.arguments.no_dupes: - if file_hash in GLOBAL.downloadedPosts(): - raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(file_hash) - - with open(file_path, 'wb') as file: - file.write(download_content) - if not silent: - logger.info(" " * indent + "Downloaded" + " " * 10) - return + try: + download_content = requests.get(resource_url, headers=headers).content + except requests.exceptions.ConnectionError: + continue + return Resource(self.post, resource_url, download_content) raise FailedToDownload - - @staticmethod - def _get_extension(url: str) -> str: - pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))') - if results := re.search(pattern, url): - if len(results.groups()) > 1: - return results[0] - if "v.redd.it" not in url: - return '.jpg' - else: - return '.mp4' diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py index 95ac00f..cb90752 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -2,18 +2,14 @@ import pathlib +from praw.models import Submission + from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.utils import GLOBAL class Direct(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - self.post['EXTENSION'] = self._get_extension(self.post['CONTENTURL']) - self.directory.mkdir(exist_ok=True) - - filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] - self._download_resource(pathlib.Path(filename), self.directory, self.post['CONTENTURL']) + return [self._download_resource(self.post.url)]
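The retry loop in the new BaseDownloader._download_resource above is worth spelling out, since requests wraps low-level resets like ConnectionResetError in its own requests.exceptions.ConnectionError. A minimal standalone sketch of the bounded-retry pattern, where the timeout value is my assumption and not part of the patch:

import requests

def fetch_with_retries(url: str, headers: dict, attempts: int = 3) -> bytes:
    # Retry transient connection failures up to `attempts` times; the caller
    # translates a final failure into FailedToDownload, as download() does.
    last_error = None
    for _ in range(attempts):
        try:
            return requests.get(url, headers=headers, timeout=30).content  # timeout assumed
        except requests.exceptions.ConnectionError as error:
            last_error = error
    raise last_error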
diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 540733f..84ee3c9 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -7,77 +7,39 @@ import urllib.error import urllib.request from html.parser import HTMLParser +from praw.models import Submission + +from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class Erome(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): try: - images = self._get_links(self.post['CONTENTURL']) + images = self._get_links(self.post.url) except urllib.error.HTTPError: raise NotADownloadableLinkError("Not a downloadable link") - images_length = len(images) - how_many_downloaded = len(images) - duplicates = 0 - - if images_length == 1: - """Filenames are declared here""" - filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] + if len(images) == 1: image = images[0] if not re.match(r'https?://.*', image): image = "https://" + image - - self._download_resource(filename, self.directory, image) + return [self._download_resource(image)] else: - filename = GLOBAL.config['filename'].format(**self.post) - logger.info(filename) - - folder_dir = self.directory / filename - - folder_dir.mkdir(exist_ok=True) - + out = [] for i, image in enumerate(images): - extension = self._get_extension(image) - filename = str(i + 1) + extension - if not re.match(r'https?://.*', image): image = "https://" + image - logger.info(" ({}/{})".format(i + 1, images_length)) - logger.info(" {}".format(filename)) - - try: - self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2) - except FileAlreadyExistsError: - logger.info(" The file already exists" + " " * 10, end="\n\n") - duplicates += 1 - how_many_downloaded -= 1 - - except Exception as exception: - # raise exception - logger.error("\n Could not get the file") - logger.error( - " " - + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)) - + "\n" - ) - how_many_downloaded -= 1 - - if duplicates == images_length: - raise FileAlreadyExistsError - elif how_many_downloaded + duplicates < images_length: - raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + out.append(self._download_resource(image)) + return out @staticmethod def _get_links(url: str) -> list[str]:
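Erome pages frequently list scheme-less links, which is why both branches of Erome.download prefix "https://" before fetching. The normalization, reduced to a standalone helper (the helper name and sample host are illustrative):

import re

def ensure_https(url: str) -> str:
    # Scheme-less links ("s1.example.com/img.jpg") cannot be fetched as-is;
    # mirror the re.match check used in Erome.download above.
    if not re.match(r'https?://.*', url):
        return 'https://' + url
    return url

assert ensure_https('s1.example.com/img.jpg') == 'https://s1.example.com/img.jpg'
assert ensure_https('https://s1.example.com/img.jpg') == 'https://s1.example.com/img.jpg'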
diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 59334be..7a4c732 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -1,26 +1,23 @@ #!/usr/bin/env python3 import json -import pathlib import logging -import urllib.parse +import pathlib import requests +from praw.models import Submission +from bulkredditdownloader.errors import ImageNotFound, NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, - NotADownloadableLinkError, TypeInSkip) -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class Gallery(BaseDownloader): - def __init__(self, directory: pathlib.Path, post): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - link = self.post['CONTENTURL'] + link = self.post.url self.raw_data = self._get_data(link) - self.download() def download(self): images = {} @@ -37,7 +34,7 @@ class Gallery(BaseDownloader): except KeyError: continue - self._download_album(images, count) + return self._download_album(images) @staticmethod def _get_data(link: str) -> dict: @@ -63,44 +60,9 @@ class Gallery(BaseDownloader): data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1]) return data - def _download_album(self, images: dict, count: int): - folder_name = GLOBAL.config['filename'].format(**self.post) - folder_dir = self.directory / folder_name - - how_many_downloaded = 0 - duplicates = 0 - - folder_dir.mkdir(exist_ok=True) - logger.info(folder_name) - + def _download_album(self, images: dict): + out = [] for i, image in enumerate(images): - path = urllib.parse.urlparse(image['url']).path - extension = pathlib.Path(path).suffix + out.append(self._download_resource(image['url'])) - filename = pathlib.Path("_".join([str(i + 1), image['id']]) + extension) - - logger.info("\n ({}/{})".format(i + 1, count)) - - try: - self._download_resource(filename, folder_dir, image['url'], indent=2) - how_many_downloaded += 1 - - except FileAlreadyExistsError: - logger.info(" The file already exists" + " " * 10, end="\n\n") - duplicates += 1 - - except TypeInSkip: - logger.info(" Skipping...") - how_many_downloaded += 1 - - except Exception as exception: - logger.info("\n Could not get the file") - logger.info(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exception.__class__.__name__, info=str(exception)) + "\n" - ) - logger.info(GLOBAL.log_stream.getvalue(), no_print=True) - - if duplicates == count: - raise FileAlreadyExistsError - elif how_many_downloaded + duplicates < count: - raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + return out diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index bd1d694..1bc442d 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -6,14 +6,14 @@ import re import urllib.request from bs4 import BeautifulSoup +from praw.models import Submission from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Gfycat(GifDeliveryNetwork): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - super().download() + return super().download() diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 85252cb..ba84695 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -4,29 +4,23 @@ import pathlib import urllib.request from bs4 import BeautifulSoup +from praw.models import Submission -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import NotADownloadableLinkError -from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader class GifDeliveryNetwork(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): try: - self.post['MEDIAURL'] = self._get_link(self.post['CONTENTURL']) + media_url = self._get_link(self.post.url) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - self.post['EXTENSION'] = self._get_extension(self.post['MEDIAURL']) - self.directory.mkdir(exist_ok=True) - - filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] - - self._download_resource(filename, self.directory, self.post['MEDIAURL']) + return [self._download_resource(media_url)] @staticmethod def _get_link(url: str) -> str:
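GifDeliveryNetwork._get_link, whose body is not shown in this hunk, scrapes the page source for the direct video URL; the IndexError it can raise is what the new download translates into NotADownloadableLinkError. A sketch of the general technique, assuming the target page exposes the video in a source tag (the function name and markup assumption are mine, not from the patch):

import urllib.request

from bs4 import BeautifulSoup

def first_video_source(page_url: str) -> str:
    # Hypothetical scraper: fetch the page and take the first <source> tag's src.
    page = urllib.request.urlopen(page_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    sources = soup.find_all('source')
    return sources[0]['src']  # an IndexError here maps to NotADownloadableLinkError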
diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index b1c2016..d821121 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -1,16 +1,15 @@ #!/usr/bin/env python3 import json -import pathlib import logging +import pathlib import requests +from praw.models import Submission +from bulkredditdownloader.errors import ExtensionError, ImageNotFound, NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, - ImageNotFound, NotADownloadableLinkError, TypeInSkip) -from bulkredditdownloader.utils import GLOBAL, nameCorrector logger = logging.getLogger(__name__) @@ -19,85 +18,43 @@ class Imgur(BaseDownloader): imgur_image_domain = "https://i.imgur.com/" - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) self.raw_data = {} - self.download() def download(self): - link = self.post['CONTENTURL'] + link = self.post.url if link.endswith(".gifv"): - link = link.replace(".gifv", ".mp4") - Direct(self.directory, {**self.post, 'CONTENTURL': link}) - return + self.post.url = link.replace(".gifv", ".mp4") + direct_thing = Direct(self.directory, self.post) + return direct_thing.download() self.raw_data = self._get_data(link) - if self._is_album: + if self._is_album(): if self.raw_data["album_images"]["count"] != 1: - self._download_album(self.raw_data["album_images"]) + out = self._download_album(self.raw_data["album_images"]) else: - self._download_image(self.raw_data["album_images"]["images"][0]) + out = self._download_image(self.raw_data["album_images"]["images"][0]) else: - self._download_image(self.raw_data) + out = self._download_image(self.raw_data) + return out def _download_album(self, images: dict): - folder_name = GLOBAL.config['filename'].format(**self.post) - folder_dir = self.directory / folder_name images_length = images["count"] - how_many_downloaded = 0 - duplicates = 0 - folder_dir.mkdir(exist_ok=True) - logger.info(folder_name) + out = [] for i in range(images_length): extension = self._validate_extension(images["images"][i]["ext"]) image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension - filename = pathlib.Path("_".join([str(i + 1), - nameCorrector(images["images"][i]['title']), - images["images"][i]['hash']]) + extension) - - logger.info("\n ({}/{})".format(i + 1, images_length)) - - try: - self._download_resource(filename, folder_dir, image_url, indent=2) - how_many_downloaded += 1 - - except FileAlreadyExistsError: - logger.info(" The file already exists" + " " * 10, end="\n\n") - duplicates += 1 - - except TypeInSkip: - logger.info(" Skipping...") - how_many_downloaded += 1 - - except Exception as exception: - logger.info("\n Could not get the file") - logger.info( - " " - + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exception.__class__.__name__, - info=str(exception) - ) - + "\n" - ) - logger.info(GLOBAL.log_stream.getvalue(), no_print=True) - - if duplicates == images_length: - raise FileAlreadyExistsError - elif how_many_downloaded + duplicates < images_length: - raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + out.append(self._download_resource(image_url)) + return out def _download_image(self, image: dict): extension = self._validate_extension(image["ext"]) image_url = self.imgur_image_domain + image["hash"] + extension - - filename = GLOBAL.config['filename'].format(**self.post) + extension - - self._download_resource(filename, self.directory, image_url) + return [self._download_resource(image_url)] def _is_album(self) -> bool: return "album_images" in self.raw_data @@ -134,9 +91,8 @@ class Imgur(BaseDownloader): @staticmethod def _validate_extension(extension_suffix: str) -> str: possible_extensions = [".jpg", ".png", ".mp4", ".gif"] - for extension in possible_extensions: if extension in extension_suffix: return extension else: - raise ExtensionError(f"\"{extension_suffix}\" is not recognized as a valid extension.") + raise ExtensionError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')
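For reference, the direct link that _download_image and _download_album assemble from an Imgur API payload looks like this (the payload fragment is illustrative, not real API output):

imgur_image_domain = "https://i.imgur.com/"
image = {"hash": "abc123", "ext": ".jpg"}  # illustrative payload fragment

# Mirror the concatenation used in Imgur._download_image above.
url = imgur_image_domain + image["hash"] + image["ext"]
assert url == "https://i.imgur.com/abc123.jpg"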
diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 2f5f520..2c109d7 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -5,24 +5,22 @@ import pathlib import urllib.request from bs4 import BeautifulSoup +from praw.models import Submission -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Redgifs(GifDeliveryNetwork): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - super().download() + return super().download() @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source - and return it - """ + """Extract direct link to the video from page's source and return it""" if '.webm' in url or '.mp4' in url or '.gif' in url: return url diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index c94df7e..05f576e 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -1,64 +1,39 @@ #!/usr/bin/env python3 -import io import logging import pathlib -from pathlib import Path +from praw.models import Submission + +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class SelfPost(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - if "self" in GLOBAL.arguments.skip: - raise TypeInSkip + return [Resource(self.post, self.post.url, self.export_to_string().encode('utf-8'))] - self.directory.mkdir(exist_ok=True) - filename = GLOBAL.config['filename'].format(**self.post) - - file_dir = self.directory / (filename + ".md") - logger.info(file_dir) - logger.info(filename + ".md") - - if Path.is_file(file_dir): - raise FileAlreadyExistsError - - try: - self._write_to_file(file_dir, self.post) - except FileNotFoundError: - file_dir = self.post['POSTID'] + ".md" - file_dir = self.directory / file_dir - - self._write_to_file(file_dir, self.post) - - @staticmethod - def _write_to_file(directory: pathlib.Path, post: dict): + def export_to_string(self) -> str: """Self posts are formatted here""" content = ("## [" - + post["TITLE"] + + self.post.title + "](" - + post["CONTENTURL"] + + self.post.url + ")\n" - + post["CONTENT"] + + self.post.selftext + "\n\n---\n\n" + "submitted to [r/" - + post["SUBREDDIT"] + + self.post.subreddit.display_name + "](https://www.reddit.com/r/" - + post["SUBREDDIT"] + + self.post.subreddit.display_name + ") by [u/" - + post["REDDITOR"] + + self.post.author.name + "](https://www.reddit.com/user/" - + post["REDDITOR"] + + self.post.author.name + ")") - - with io.open(directory, "w", encoding="utf-8") as FILE: - print(content, file=FILE) - logger.info("Downloaded") + return content
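The markdown that export_to_string produces, rendered standalone with illustrative field values in place of a real praw Submission:

title = "Example post"        # illustrative stand-ins for Submission attributes
url = "https://redd.it/abc123"
selftext = "Hello world"
subreddit = "test"
author = "example_user"

# Same concatenation as SelfPost.export_to_string above.
content = ("## [" + title + "](" + url + ")\n"
           + selftext
           + "\n\n---\n\n"
           + "submitted to [r/" + subreddit + "](https://www.reddit.com/r/" + subreddit
           + ") by [u/" + author + "](https://www.reddit.com/user/" + author + ")")
print(content)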
diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index 2b4ee03..d13bece 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -4,61 +4,49 @@ import logging import os import pathlib import subprocess +import tempfile +import requests +from praw.models import Submission + +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class VReddit(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - extension = ".mp4" - self.directory.mkdir(exist_ok=True) - - filename = GLOBAL.config['filename'].format(**self.post) + extension - try: fnull = open(os.devnull, 'w') subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) - except Exception: - self._download_resource(filename, self.directory, self.post['CONTENTURL']) - logger.info("FFMPEG library not found, skipping merging video and audio") + except (FileNotFoundError, subprocess.SubprocessError): + logger.info("FFMPEG library not found, skipping merging video and audio") + return [self._download_resource(self.post.url)] else: - video_name = self.post['POSTID'] + "_video" - video_url = self.post['CONTENTURL'] - audio_name = self.post['POSTID'] + "_audio" + video_url = self.post.url audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' - logger.info(self.directory, filename, sep="\n") - - self._download_resource(video_name, self.directory, video_url, silent=True) - self._download_resource(audio_name, self.directory, audio_url, silent=True) - try: - self._merge_audio(video_name, audio_name, filename, self.directory) - except KeyboardInterrupt: - (self.directory / filename).unlink() - (self.directory / audio_name).unlink() - (self.directory / video_name).unlink() - (self.directory / filename).unlink() + with tempfile.TemporaryDirectory() as temp_raw: + temp_dir = pathlib.Path(temp_raw) + video = requests.get(video_url).content + audio = requests.get(audio_url).content + with open(temp_dir / 'video', 'wb') as file: + file.write(video) + with open(temp_dir / 'audio', 'wb') as file: + file.write(audio) + self._merge_audio(temp_dir) + with open(temp_dir / 'output.mp4', 'rb') as file: + content = file.read() + return [Resource(self.post, self.post.url, content)] @staticmethod - def _merge_audio( - video: pathlib.Path, - audio: pathlib.Path, - filename: pathlib.Path, - directory: pathlib.Path): - input_video = str(directory / video) - input_audio = str(directory / audio) + def _merge_audio(working_directory: pathlib.Path): + input_video = working_directory / 'video' + input_audio = working_directory / 'audio' fnull = open(os.devnull, 'w') cmd = "ffmpeg -i {} -i {} -c:v copy -c:a aac -strict experimental {}".format( - input_audio, input_video, str(directory / filename)) + input_audio, input_video, str(working_directory / 'output.mp4')) subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT) - - (directory / video).unlink() - (directory / audio).unlink()
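The audio-track URL that VReddit.download derives is just the video URL with its last path segment swapped for DASH_audio.mp4, matching the DASH layout reddit used at the time; the sample URL is illustrative:

video_url = "https://v.redd.it/abc123/DASH_720.mp4"   # illustrative v.redd.it URL
audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
assert audio_url == "https://v.redd.it/abc123/DASH_audio.mp4"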
diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index afabf66..b99b2a1 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -1,64 +1,37 @@ #!/usr/bin/env python3 import logging -import os import pathlib -import sys +import tempfile import youtube_dl +from praw.models import Submission +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import FileAlreadyExistsError -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class Youtube(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - self.directory.mkdir(exist_ok=True) + return [self._download_video()] - filename = GLOBAL.config['filename'].format(**self.post) - logger.info(filename) + def _download_video(self) -> Resource: + with tempfile.TemporaryDirectory() as temp_raw: + temp_dir = pathlib.Path(temp_raw) + ydl_opts = { + "format": "best", + "outtmpl": str(temp_dir / "test.%(ext)s"), + "playlistend": 1, + "nooverwrites": True, + "quiet": True + } + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download([self.post.url]) - self._download_video(filename, self.directory, self.post['CONTENTURL']) - - def _download_video(self, filename: str, directory: pathlib.Path, url: str): - ydl_opts = { - "format": "best", - "outtmpl": str(directory / (filename + ".%(ext)s")), - "progress_hooks": [self._hook], - "playlistend": 1, - "nooverwrites": True, - "quiet": True - } - with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) - - location = directory / (filename + ".mp4") - - with open(location, 'rb') as file: - content = file.read() - - if GLOBAL.arguments.no_dupes: - try: - file_hash = self._create_hash(content) - except FileNotFoundError: - return None - if file_hash in GLOBAL.downloadedPosts(): - os.remove(location) - raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(file_hash) - - @staticmethod - def _hook(d): - if d['status'] == 'finished': - return logger.info("Downloaded") - downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6))) - file_size = int(d['total_bytes'] * (10**(-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size)) - sys.stdout.flush() + with open(temp_dir / 'test.mp4', 'rb') as file: + content = file.read() + return Resource(self.post, self.post.url, content) diff --git a/bulkredditdownloader/store.py b/bulkredditdownloader/store.py index 79cdf43..5aba94e 100644 --- a/bulkredditdownloader/store.py +++ b/bulkredditdownloader/store.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + from os import path
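The youtube_dl call in Youtube._download_video above, as a self-contained sketch. The sample URL is illustrative, it fetches from the network when run, and reading back test.mp4 assumes "format": "best" yields an mp4 container, as the class itself does:

import pathlib
import tempfile

import youtube_dl

with tempfile.TemporaryDirectory() as temp_raw:
    temp_dir = pathlib.Path(temp_raw)
    ydl_opts = {
        "format": "best",
        "outtmpl": str(temp_dir / "test.%(ext)s"),  # youtube-dl fills in the extension
        "playlistend": 1,
        "nooverwrites": True,
        "quiet": True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://www.youtube.com/watch?v=dQw4w9WgXcQ"])  # illustrative URL
    content = (temp_dir / "test.mp4").read_bytes()  # assumes an mp4 container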
diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py index 951ef81..41c8335 100644 --- a/bulkredditdownloader/tests/downloaders/test_base_downloader.py +++ b/bulkredditdownloader/tests/downloaders/test_base_downloader.py @@ -1,42 +1,30 @@ -#!/uasr/bin/env python3 +#!/usr/bin/env python3 # coding=utf-8 from pathlib import Path +from unittest.mock import Mock import pytest +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -@pytest.mark.parametrize(('test_bytes', 'expected'), ((b'test', '098f6bcd4621d373cade4e832627b4f6'), - (b'test2', 'ad0234829205b9033196ba818f7a872b'))) -def test_create_hash(test_bytes: bytes, expected: str): - result = BaseDownloader._create_hash(test_bytes) - assert result == expected +class BlankDownloader(BaseDownloader): + def __init__(self, directory, post): + super().__init__(directory, post) + + def download(self) -> list[Resource]: + return [self._download_resource(self.post.url)] -@pytest.mark.parametrize(('test_url', 'expected'), (('test.png', '.png'), - ('random.jpg', '.jpg'), - ('http://random.com/test.png', '.png'), - ('https://example.net/picture.jpg', '.jpg'), - ('https://v.redd.it/picture', '.mp4'), - ('https://v.redd.it/picture.jpg', '.jpg'), - ('https:/random.url', '.jpg') - )) -def test_get_extension(test_url: str, expected: str): - result = BaseDownloader._get_extension(test_url) - assert result == expected - - -@pytest.mark.skip -@pytest.mark.parametrize(('test_url', 'expected_hash'), (('https://www.iana.org/_img/2013.1/iana-logo-header.svg', ''), - ('', '') - )) -def test_download_resource(test_url: str, expected_hash: str, tmp_path: Path): - test_file = tmp_path / 'test' - BaseDownloader._download_resource(test_file, tmp_path, test_url) - assert test_file.exists() - with open(test_file, 'rb') as file: - content = file.read() - hash_result = BaseDownloader._create_hash(content) - assert hash_result == expected_hash +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'), +)) +def test_get_resource(test_url: str, expected_hash: str): + mock_submission = Mock() + mock_submission.url = test_url + downloader = BlankDownloader(Path('.'), mock_submission) + result = downloader.download() + assert isinstance(result[0], Resource) + assert result[0].hash.hexdigest() == expected_hash
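What test_get_resource asserts, reduced to primitives. This assumes Resource hashes its content with MD5, as the removed _create_hash did; the URL and expected digest are the ones from the test above, and the snippet fetches from the network when run:

import hashlib

import requests

content = requests.get('https://docs.python.org/3/_static/py.png').content
assert hashlib.md5(content).hexdigest() == 'a721fc7ec672275e257bbbfde49a4d4e'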
-def nameCorrector(string: str, reference: Optional[str] = None) -> str: - """Swap strange characters from given string - with underscore (_) and shorten it. - Return the string - """ - limit = 247 - string_length = len(string) - - if reference: - reference_length = len(reference) - total_lenght = reference_length - else: - total_lenght = string_length - - if total_lenght > limit: - limit -= reference_length - string = string[:limit - 1] - - string = string.replace(" ", "_") - - if len(string.split('\n')) > 1: - string = "".join(string.split('\n')) - - bad_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '#', '.', '@', '“', '’', '\'', '!'] - string = "".join([i if i not in bad_chars else "_" for i in string]) - - return string diff --git a/setup.py b/setup.py index ab78f46..a8c413f 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,15 @@ #!C:\Users\Ali\AppData\Local\Programs\Python\Python36\python.exe -## python setup.py build +# python setup.py build import sys -from cx_Freeze import setup, Executable + +from cx_Freeze import Executable, setup + from bulkredditdownloader.__main__ import __version__ options = { "build_exe": { - "packages":[ + "packages": [ "idna", "praw", "requests", "multiprocessing" ] } } if sys.platform == "win32": executables = [Executable( - "script.py", + "script.py", targetName="bulk-downloader-for-reddit.exe", shortcutName="Bulk Downloader for Reddit", shortcutDir="DesktopFolder" )] elif sys.platform == "linux": executables = [Executable( - "script.py", + "script.py", targetName="bulk-downloader-for-reddit", shortcutName="Bulk Downloader for Reddit", shortcutDir="DesktopFolder" )] setup( - name = "Bulk Downloader for Reddit", - version = __version__, - description = "Bulk Downloader for Reddit", - author = "Ali Parlakci", + name="Bulk Downloader for Reddit", + version=__version__, + description="Bulk Downloader for Reddit", + author="Ali Parlakci", author_email="parlakciali@gmail.com", url="https://github.com/aliparlakci/bulk-downloader-for-reddit", classifiers=( - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 (GPLv3)" - "Natural Language :: English", - "Environment :: Console", - "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Natural Language :: English", + "Environment :: Console", + "Operating System :: OS Independent", ), - executables = executables, - options = options + executables=executables, + options=options ) - -
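One subtlety behind the comma added after the GPLv3 classifier: adjacent Python string literals concatenate implicitly, so without it the license and natural-language classifiers silently merge into a single bogus entry. A two-line demonstration:

classifiers = (
    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)"
    "Natural Language :: English",   # no comma above: the two literals fuse into one string
)
assert classifiers == ("License :: OSI Approved :: GNU General Public License v3 (GPLv3)Natural Language :: English",)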